-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdataset_splitter.py
78 lines (68 loc) · 2.7 KB
/
dataset_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#split_train_test.py
import os
import shutil
import random
import numpy as np
# Folder that holds every source PDF. File names are split on "_" below, so
# they are expected to carry at least two "_"-separated fields.
src_folder_path = '../data/pdfs/TS_MET_notes/ALL/'

# Destination folders used only by the disabled split logic that is kept as
# comments at the end of this file; the active code does not touch them.
out_sets = [
    '../data/pdfs/TS_MET_notes/SET0/',
    '../data/pdfs/TS_MET_notes/SET1/',
    '../data/pdfs/TS_MET_notes/SET2/',
    '../data/pdfs/TS_MET_notes/SET3/',
    '../data/pdfs/TS_MET_notes/ARCHIVE/',
]

# Destination folder for the files selected through `redo_files`.
# NOTE: shutil.copy does not create missing directories and the mkdir below is
# disabled, so this folder has to exist before any file is copied.
out_set = '../data/pdfs/TS_MET_notes/REDO/'

# Disabled reset of the output folders (kept verbatim):
# for file_path in out_sets:
# if os.path.isdir(file_path):
# shutil.rmtree(file_path)
# os.mkdir(out_set)

src_file_names = os.listdir(src_folder_path)

# Values matched against the second "_"-separated field of each file name
# (presumably the note date/timestamp token, judging by the disabled code
# further down -- confirm). While the list is empty nothing is copied.
redo_files = [
]
# random.shuffle(src_file_names)

# Build the lookup set once so each membership test in the loop is O(1).
redo_tokens = set(redo_files)

for file_name in src_file_names:
    name_parts = file_name.split('_')
    if len(name_parts) < 2:
        # No "_" in the name (e.g. a stray system file): there is no second
        # field to compare, so skip it instead of raising IndexError.
        continue

    token = name_parts[1]
    if token in redo_tokens:
        print(token)
        # os.path.join keeps the paths correct whether or not the folder
        # constants end with a slash.
        shutil.copy(
            os.path.join(src_folder_path, file_name),
            os.path.join(out_set, file_name),
        )
# cnt = 1
# for file_name in src_file_names:
# if not os.path.exists(out_set + file_name):
# print (cnt)
# cnt+=1
# shutil.copy(src_folder_path+file_name, out_set + file_name)
# print(src_file_names)
# patient_ids = [f.split("_")[0] for f in src_file_names]
# note_date = [int((f.split("_")[1])[:8]) for f in src_file_names]
# print(note_date)
# cnt = 0
# set0_patients = []
# for i in range(len(src_file_names)):
# if cnt < 50 and note_date[i] > 20180000 and patient_ids[i] not in set0_patients:
# print (cnt)
# cnt += 1
# set0_patients.append(patient_ids[i])
# shutil.copy(src_folder_path+src_file_names[i], out_sets[0] +src_file_names[i])
# elif cnt < 100 and note_date[i] > 20180000 and patient_ids[i] not in set0_patients:
# print (cnt)
# cnt += 1
# set0_patients.append(patient_ids[i])
# shutil.copy(src_folder_path+src_file_names[i], out_sets[1] +src_file_names[i])
# elif cnt < 150 and note_date[i] > 20180000 and patient_ids[i] not in set0_patients:
# print (cnt)
# cnt += 1
# set0_patients.append(patient_ids[i])
# shutil.copy(src_folder_path+src_file_names[i], out_sets[2] +src_file_names[i])
# elif cnt < 300 and note_date[i] > 20180000 and patient_ids[i] not in set0_patients:
# print (cnt)
# cnt += 1
# set0_patients.append(patient_ids[i])
# shutil.copy(src_folder_path+src_file_names[i], out_sets[3] +src_file_names[i])
# else:
# # print (cnt)
# # set0_patients.append(patient_ids[i])
# shutil.copy(src_folder_path+src_file_names[i], out_sets[4] +src_file_names[i])
# # for shuffle_file_name in shuffle_file_names[:50]:
# # shutil.move(src_folder_path+shuffle_file_name, out_0_folder_path +shuffle_file_name)
# # for shuffle_file_name in shuffle_file_names[50:100]:
# # shutil.move(src_folder_path+shuffle_file_name, out_1_folder_path +shuffle_file_name)