diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c6dcdaa..ee19176 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -44,5 +44,9 @@ jobs: python -m pip install pytest python -m pip install -e . - name: Run tests - run: pytest + run: | + cd tests + python bootstrap.py + cd .. + pytest diff --git a/tests/NIH_test_data/folder/00000001_000.png b/tests/NIH_test_data/folder/00000001_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000001_000.png differ diff --git a/tests/NIH_test_data/folder/00000002_000.png b/tests/NIH_test_data/folder/00000002_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000002_000.png differ diff --git a/tests/NIH_test_data/folder/00000003_001.png b/tests/NIH_test_data/folder/00000003_001.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000003_001.png differ diff --git a/tests/NIH_test_data/folder/00000005_000.png b/tests/NIH_test_data/folder/00000005_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000005_000.png differ diff --git a/tests/NIH_test_data/folder/00000006_000.png b/tests/NIH_test_data/folder/00000006_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000006_000.png differ diff --git a/tests/NIH_test_data/folder/00000007_000.png b/tests/NIH_test_data/folder/00000007_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000007_000.png differ diff --git a/tests/NIH_test_data/folder/00000008_000.png b/tests/NIH_test_data/folder/00000008_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000008_000.png differ diff --git a/tests/NIH_test_data/folder/00000009_000.png 
b/tests/NIH_test_data/folder/00000009_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000009_000.png differ diff --git a/tests/NIH_test_data/folder/00000010_000.png b/tests/NIH_test_data/folder/00000010_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000010_000.png differ diff --git a/tests/NIH_test_data/folder/00000011_000.png b/tests/NIH_test_data/folder/00000011_000.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/NIH_test_data/folder/00000011_000.png differ diff --git a/tests/NIH_test_data/tar.tar b/tests/NIH_test_data/tar.tar new file mode 100644 index 0000000..12ffa33 Binary files /dev/null and b/tests/NIH_test_data/tar.tar differ diff --git a/tests/NIH_test_data/zip.zip b/tests/NIH_test_data/zip.zip new file mode 100644 index 0000000..6b85cba Binary files /dev/null and b/tests/NIH_test_data/zip.zip differ diff --git a/tests/PC_test_data/folder/117677712752528732526839762067921423608_ag14ef.png b/tests/PC_test_data/folder/117677712752528732526839762067921423608_ag14ef.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/117677712752528732526839762067921423608_ag14ef.png differ diff --git a/tests/PC_test_data/folder/125374151943505747025890313053997514922_j5rk5q.png b/tests/PC_test_data/folder/125374151943505747025890313053997514922_j5rk5q.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/125374151943505747025890313053997514922_j5rk5q.png differ diff --git a/tests/PC_test_data/folder/12986792586879524468475227325378905558_f8k8bq.png b/tests/PC_test_data/folder/12986792586879524468475227325378905558_f8k8bq.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/12986792586879524468475227325378905558_f8k8bq.png differ diff --git 
a/tests/PC_test_data/folder/216840111366964013076187734852011294130558067_00-193-087.png b/tests/PC_test_data/folder/216840111366964013076187734852011294130558067_00-193-087.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/216840111366964013076187734852011294130558067_00-193-087.png differ diff --git a/tests/PC_test_data/folder/216840111366964013199786354762011304090209759_01-001-141.png b/tests/PC_test_data/folder/216840111366964013199786354762011304090209759_01-001-141.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/216840111366964013199786354762011304090209759_01-001-141.png differ diff --git a/tests/PC_test_data/folder/216840111366964013534861372972012353112932019_01-129-187.png b/tests/PC_test_data/folder/216840111366964013534861372972012353112932019_01-129-187.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/216840111366964013534861372972012353112932019_01-129-187.png differ diff --git a/tests/PC_test_data/folder/216840111366964013663026955732013130130708697_02-042-053.png b/tests/PC_test_data/folder/216840111366964013663026955732013130130708697_02-042-053.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/216840111366964013663026955732013130130708697_02-042-053.png differ diff --git a/tests/PC_test_data/folder/300276821619038978339766780609910437168_fgj94z.png b/tests/PC_test_data/folder/300276821619038978339766780609910437168_fgj94z.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/300276821619038978339766780609910437168_fgj94z.png differ diff --git a/tests/PC_test_data/folder/318099584815157573909341904878106290504_erj1pn.png b/tests/PC_test_data/folder/318099584815157573909341904878106290504_erj1pn.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and 
b/tests/PC_test_data/folder/318099584815157573909341904878106290504_erj1pn.png differ diff --git a/tests/PC_test_data/folder/336326204749794028912525116839151974960_vsaz23.png b/tests/PC_test_data/folder/336326204749794028912525116839151974960_vsaz23.png new file mode 100644 index 0000000..67f5710 Binary files /dev/null and b/tests/PC_test_data/folder/336326204749794028912525116839151974960_vsaz23.png differ diff --git a/tests/PC_test_data/tar.tar b/tests/PC_test_data/tar.tar new file mode 100644 index 0000000..c229700 Binary files /dev/null and b/tests/PC_test_data/tar.tar differ diff --git a/tests/PC_test_data/zip.zip b/tests/PC_test_data/zip.zip new file mode 100644 index 0000000..0bb9d89 Binary files /dev/null and b/tests/PC_test_data/zip.zip differ diff --git a/tests/bootstrap.py b/tests/bootstrap.py new file mode 100644 index 0000000..54e2cba --- /dev/null +++ b/tests/bootstrap.py @@ -0,0 +1,154 @@ +#bash generate_all.sh +from create_standard_test_output import create_standard_test_output +from gen_mimic import generate_mimic_test_data +from gen_chexpert import gen_chexpert +import torchxrayvision as xrv +from generate_test_data import generate_test_data +import pandas as pd + +n = 10 + +def create_csv_files(n): + print("nih") + create_standard_test_output( + xrv.datasets.NIH_Dataset(imgpath="."), + "nih.csv", + n=n + ) + + print("pc") + create_standard_test_output( + xrv.datasets.PC_Dataset(imgpath="."), + "pc.csv", + n=n + ) + + print("openi") + openi = xrv.datasets.Openi_Dataset(imgpath=".") + create_standard_test_output( + openi, + "openi.csv", + columns=list(openi.dicom_metadata.columns) + ["imageid"], + n=n + ) + + #RSNA train + print("rsna (just train data)") + rsna = xrv.datasets.RSNA_Pneumonia_Dataset(imgpath=".") + create_standard_test_output( + xrv.datasets.RSNA_Pneumonia_Dataset(imgpath="."), + "rsna_train.csv", + columns=list(rsna.raw_csv.columns) + ["patientId"], + n=n + ) + + #Chexpert + gen_chexpert(n, "test_chexpert_data.csv") + + #No data is 
generated for the COVID19, NLM_TB (Shenzen) or + #NLM_TB (Montgomery) datasets. + +def create_images(): + + #python3 generate_test_data.py pc.csv ImageID 2 2 PC_test_data + generate_test_data( + pd.read_csv("pc.csv"), + "ImageID", + (2, 2), + "PC_test_data", + "", + "." + ) + + #python3 generate_test_data.py nih.csv "Image Index" 2 2 NIH_test_data + generate_test_data( + pd.read_csv("nih.csv"), + "Image Index", + (2, 2), + "NIH_test_data", + "", + "." + ) + + #python3 generate_test_data.py openi.csv imageid 2 2 Openi_test_data .png + generate_test_data( + pd.read_csv("openi.csv"), + "imageid", + (2, 2), + "Openi_test_data", + ".png", + "." + ) + + #python3 generate_test_data.py shenzen.csv fname 2 2 Shenzen_test_data --subfolder CXR_png + generate_test_data( + pd.read_csv("shenzen.csv"), + "fname", + (2, 2), + "Shenzen_test_data", + "", + "CXR_png" + ) + + #python3 generate_test_data.py montgomery.csv fname 2 2 Montgomery_test_data --subfolder CXR_png + generate_test_data( + pd.read_csv("montgomery.csv"), + "fname", + (2, 2), + "Montgomery_test_data", + "", + "CXR_png" + ) + + #python3 generate_test_data.py rsna_train.csv patientId 2 2 RSNA_test_data_jpg .jpg --subfolder stage_2_train_images + generate_test_data( + pd.read_csv("rsna_train.csv"), + "patientId", + (2, 2), + "RSNA_test_data_jpg", + ".jpg", + "stage_2_train_images" + ) + + #python3 generate_test_data.py rsna_train.csv patientId 2 2 RSNA_test_data_dcm .dcm --subfolder stage_2_train_images + generate_test_data( + pd.read_csv("rsna_train.csv"), + "patientId", + (2, 2), + "RSNA_test_data_dcm", + ".dcm", + "stage_2_train_images" + ) + + #python3 generate_test_data.py test_chexpert_data.csv Path 2 2 CheXpert_test_data + generate_test_data( + pd.read_csv("test_chexpert_data.csv"), + "Path", + (2, 2), + "CheXpert_test_data", + "", + "." 
+ ) + + #python3 generate_test_data.py test_covid_data.csv filename 2 2 COVID_test_data --subfolder images + generate_test_data( + pd.read_csv("test_covid_data.csv"), + "filename", + (2, 2), + "COVID_test_data", + "", + "." + ) + +def bootstrap_test_cases(n): + create_csv_files(n) + create_images() + #MIMIC is handled differently due to its unique structure + generate_mimic_test_data( + n, + directory = "gen_mimic", + dimensions = (2, 2) + ) + +if __name__ == "__main__": + bootstrap_test_cases(10) diff --git a/tests/create_standard_test_output.py b/tests/create_standard_test_output.py new file mode 100644 index 0000000..369e3b1 --- /dev/null +++ b/tests/create_standard_test_output.py @@ -0,0 +1,8 @@ +import torchxrayvision as xrv +import os + +def create_standard_test_output(dataset, name, columns = None, n=10): + minimal_dataset = dataset.csv[:n] + if columns is not None: + minimal_dataset = minimal_dataset[columns] + minimal_dataset.to_csv(name, index=False) diff --git a/tests/gen_chexpert.py b/tests/gen_chexpert.py new file mode 100644 index 0000000..3e0d3ba --- /dev/null +++ b/tests/gen_chexpert.py @@ -0,0 +1,68 @@ +from random_data import random_preds +import numpy as np +import pandas as pd +import random + +n = 10 + +def probability(fraction): + gran = 100 + return np.random.randint(0, gran) < gran * fraction + +def random_age(): + return random.randint(18, 100) #Assume adult + +def gen_random_study(): + views = [] + if probability(.7): + views.append({ + "Path":"view1_frontal.jpg", + "AP/PA":np.random.choice(["AP","PA"]), + "Frontal/Lateral":"Frontal" + }) + if probability(.3): + views.append({ + "Path":"view1_lateral.jpg", + "AP/PA":"", + "Frontal/Lateral":"Lateral" + }) + return views + +def gen_random_data(): + views = [] + patient_idx = 0 + while True: + n_studies = random.randint(1, 4) + sex = random.choice(["Female","Male"]) + age = random_age() + for study_idx in range(n_studies): + for radiograph in gen_random_study(): + path_format = ( + 
"CheXpert-v1.0-small/train/patient{}/study{}/{}" + ) + path = path_format.format( + str(patient_idx).zfill(5), + str(study_idx), + radiograph["Path"] + ) + radiograph.update({ + "Path":path, + "Sex":sex, + "Age":age, + }) + radiograph.update(random_preds()) + yield radiograph + patient_idx += 1 + +#Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices +#CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0 + +def gen_random_rows(nrows): + rows = [] + random_data_source = gen_random_data() + for i in range(nrows): + rows.append(next(random_data_source)) + return pd.DataFrame(rows) + +def gen_chexpert(n, filename): + gen_random_rows(n).to_csv(filename) diff --git a/tests/gen_mimic.py b/tests/gen_mimic.py new file mode 100644 index 0000000..e45e8e0 --- /dev/null +++ b/tests/gen_mimic.py @@ -0,0 +1,139 @@ +import numpy as np +import pdb +import tarfile +import pandas as pd +from PIL import Image +import random +import argparse +from pathlib import Path +import os + +from random_data import write_random_images, gen_int, gen_hex, random_pred, random_preds + +def show(x): + print(x) + return x + +mimic_metadata_filename = "mimic-cxr-2.0.0-metadata.csv" +mimic_csvdata_filename = "mimic-cxr-2.0.0-negbio.csv" + +def generate_random_metadata(n, dimensions): + columns = "dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning".split(",") + performed_procedure_step_descriptions = { + "CHEST (PA AND LAT)":{ + "n_views":2, + "view_position":["LATERAL", "PA"], + "procedure_code_meaning":"CHEST (PA AND LAT)", + "view_code_meaning":["lateral", "postero-anterior"], + 
"orientation_code_meaning":["Erect","Recumbent"] + } + } + + def generate_random_row(dimensions): + performed_procedure_step_description = random.choice( + list(performed_procedure_step_descriptions) + ) + procedure = performed_procedure_step_descriptions[performed_procedure_step_description] + n_views = procedure["n_views"] + view_index = random.randint(0, n_views - 1) + view_position = procedure["view_position"][view_index] + procedure_code_meaning = procedure["procedure_code_meaning"] + view_code_meaning = procedure["view_code_meaning"][view_index] + #Currently unsure how/if view codes are mapped to orientations + orientation_code_meaning = random.choice(procedure["orientation_code_meaning"]) + subject_id = gen_int(8) + study_id = gen_int(8) + meta_row = { + "dicom_id":"-".join([gen_hex(8) for i in range(4)]), + "subject_id":subject_id, + "study_id":study_id, + "PerformedProcedureStepDescription":performed_procedure_step_description, + "ViewPosition":view_position, + "Rows":dimensions[0], + "Columns":dimensions[1], + "StudyDate":0, + "StudyTime":0, + "ProcedureCodeSequence_CodeMeaning":procedure_code_meaning, + "ViewCodeSequence_CodeMeaning":view_code_meaning, + "PatientOrientationCodeSequence_CodeMeaning":orientation_code_meaning + } + + csv_row = { + "subject_id":subject_id, + "study_id":study_id, + } + csv_row.update(random_preds()) + return meta_row, csv_row + + meta_rows, csv_rows = show(list(zip(*show([generate_random_row(dimensions) for i in range(n)])))) + + return pd.DataFrame(meta_rows), pd.DataFrame(csv_rows) + + + +def generate_test_images(random_metadata, extracted, tarname, zipname, folder_of_zip_name, folder_of_tar_gz_name, dimensions): + paths = [] + for _, row in random_metadata.iterrows(): + subjectid = row["subject_id"] + studyid = row["study_id"] + dicom_id = row["dicom_id"] + img_fname = os.path.join("p" + subjectid[:2], "p" + subjectid, "s" + studyid, dicom_id + ".jpg") + paths.append(Path("files")/img_fname) + write_random_images(paths, 
extracted, tarname, zipname, folder_of_zip_name, folder_of_tar_gz_name, dimensions) + +def generate_mimic_test_data(n, directory, dimensions=(224, 224), tarname=None, zipname=None, folder_of_zip_name=None, folder_of_tar_gz_name = None, extracted=None): + directory = Path(directory) + if tarname is None: + tarname = directory/"images-224.tar" + if zipname is None: + zipname = directory/"images-224.zip" + if extracted is None: + extracted = directory/"images-224" + if folder_of_zip_name is None: + folder_of_zip_name = directory/"images-224-zips" + if folder_of_tar_gz_name is None: + folder_of_tar_gz_name = directory/"images-224-tgzs" + random_metadata, random_csvdata = generate_random_metadata( + n, + dimensions + ) + generate_test_images(random_metadata, extracted, tarname, zipname, folder_of_zip_name, folder_of_tar_gz_name, dimensions) + random_metadata.to_csv( + directory/mimic_metadata_filename, + index=False + ) + random_metadata.to_csv( + directory/(mimic_metadata_filename+".gz"), + compression="gzip", + index=False + ) + random_csvdata.to_csv( + directory/mimic_csvdata_filename, + index=False + ) + random_csvdata.to_csv( + directory/(mimic_csvdata_filename+".gz"), + compression="gzip", + index=False + ) + +#./images-224/files/p17/p17387118/s56770356/b983f94c-b77ad35d-8a4aa372-2faf6503-5ec94835.jpg + +#if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# parser.add_argument("n") +# parser.add_argument("directory") +# parser.add_argument("x") +# parser.add_argument("y") +# parser.add_argument("tarfile", default=None, nargs="?") +# parser.add_argument("extracted", default=None, nargs="?") +# args = parser.parse_args() +# generate_test_data( +# n=int(args.n), +# directory = args.directory, +# dimensions = (int(args.x), int(args.y)), +# tarname = args.tarfile, +# extracted = args.extracted +# ) + +#python3 gen_mimic.py 10 gen_mimic 224 224 diff --git a/tests/gen_mimic/images-224.tar b/tests/gen_mimic/images-224.tar new file mode 100644 index 
0000000..1f774a5 Binary files /dev/null and b/tests/gen_mimic/images-224.tar differ diff --git a/tests/gen_mimic/images-224.zip b/tests/gen_mimic/images-224.zip new file mode 100644 index 0000000..02f012b Binary files /dev/null and b/tests/gen_mimic/images-224.zip differ diff --git a/tests/gen_mimic/images-224/files/p11/p11a8f57f/sbf2ecf26/9622b5f2-3a3e307e-f5ca9164-375a6f70.jpg b/tests/gen_mimic/images-224/files/p11/p11a8f57f/sbf2ecf26/9622b5f2-3a3e307e-f5ca9164-375a6f70.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p11/p11a8f57f/sbf2ecf26/9622b5f2-3a3e307e-f5ca9164-375a6f70.jpg differ diff --git a/tests/gen_mimic/images-224/files/p31/p310345ea/s274bcf57/7e393cbe-6eac27c9-469708b4-dc5f22ef.jpg b/tests/gen_mimic/images-224/files/p31/p310345ea/s274bcf57/7e393cbe-6eac27c9-469708b4-dc5f22ef.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p31/p310345ea/s274bcf57/7e393cbe-6eac27c9-469708b4-dc5f22ef.jpg differ diff --git a/tests/gen_mimic/images-224/files/p34/p346b3f01/s14b48a44/dc7a8b5e-4ad1afd8-8f994433-643c39f8.jpg b/tests/gen_mimic/images-224/files/p34/p346b3f01/s14b48a44/dc7a8b5e-4ad1afd8-8f994433-643c39f8.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p34/p346b3f01/s14b48a44/dc7a8b5e-4ad1afd8-8f994433-643c39f8.jpg differ diff --git a/tests/gen_mimic/images-224/files/p3b/p3b3b7d36/sf076c36f/b69fd4aa-55745241-f15af2bb-b979004a.jpg b/tests/gen_mimic/images-224/files/p3b/p3b3b7d36/sf076c36f/b69fd4aa-55745241-f15af2bb-b979004a.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p3b/p3b3b7d36/sf076c36f/b69fd4aa-55745241-f15af2bb-b979004a.jpg differ diff --git a/tests/gen_mimic/images-224/files/p3d/p3d404d22/s1e752e16/4197fbbc-1ed2f09e-8a82308b-971009a3.jpg 
b/tests/gen_mimic/images-224/files/p3d/p3d404d22/s1e752e16/4197fbbc-1ed2f09e-8a82308b-971009a3.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p3d/p3d404d22/s1e752e16/4197fbbc-1ed2f09e-8a82308b-971009a3.jpg differ diff --git a/tests/gen_mimic/images-224/files/p4e/p4eedf9fe/sf8521632/8cf7e7b2-46bdc39f-31564b20-71ee6b6d.jpg b/tests/gen_mimic/images-224/files/p4e/p4eedf9fe/sf8521632/8cf7e7b2-46bdc39f-31564b20-71ee6b6d.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p4e/p4eedf9fe/sf8521632/8cf7e7b2-46bdc39f-31564b20-71ee6b6d.jpg differ diff --git a/tests/gen_mimic/images-224/files/p54/p54e08d2a/s1c41417d/67dc7d13-32da76c2-ff22a175-5d4c3ced.jpg b/tests/gen_mimic/images-224/files/p54/p54e08d2a/s1c41417d/67dc7d13-32da76c2-ff22a175-5d4c3ced.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p54/p54e08d2a/s1c41417d/67dc7d13-32da76c2-ff22a175-5d4c3ced.jpg differ diff --git a/tests/gen_mimic/images-224/files/p61/p6145fd64/s7968e508/baf27490-fe6678a0-fe0b7379-665d78ba.jpg b/tests/gen_mimic/images-224/files/p61/p6145fd64/s7968e508/baf27490-fe6678a0-fe0b7379-665d78ba.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/p61/p6145fd64/s7968e508/baf27490-fe6678a0-fe0b7379-665d78ba.jpg differ diff --git a/tests/gen_mimic/images-224/files/pbc/pbc0ee611/s89cfed09/6a301286-456d8528-c59bd118-195ce98c.jpg b/tests/gen_mimic/images-224/files/pbc/pbc0ee611/s89cfed09/6a301286-456d8528-c59bd118-195ce98c.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/pbc/pbc0ee611/s89cfed09/6a301286-456d8528-c59bd118-195ce98c.jpg differ diff --git a/tests/gen_mimic/images-224/files/pbf/pbfb2d1b6/s2c5d98ee/a0082cb3-93689ac6-4fbbad4e-ddd68866.jpg 
b/tests/gen_mimic/images-224/files/pbf/pbfb2d1b6/s2c5d98ee/a0082cb3-93689ac6-4fbbad4e-ddd68866.jpg new file mode 100644 index 0000000..ee20271 Binary files /dev/null and b/tests/gen_mimic/images-224/files/pbf/pbfb2d1b6/s2c5d98ee/a0082cb3-93689ac6-4fbbad4e-ddd68866.jpg differ diff --git a/tests/gen_mimic/mimic-cxr-2.0.0-metadata.csv b/tests/gen_mimic/mimic-cxr-2.0.0-metadata.csv new file mode 100644 index 0000000..ebd862d --- /dev/null +++ b/tests/gen_mimic/mimic-cxr-2.0.0-metadata.csv @@ -0,0 +1,11 @@ +dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning +dc7a8b5e-4ad1afd8-8f994433-643c39f8,346b3f01,14b48a44,CHEST (PA AND LAT),LATERAL,224,224,0,0,CHEST (PA AND LAT),lateral,Recumbent +9622b5f2-3a3e307e-f5ca9164-375a6f70,11a8f57f,bf2ecf26,CHEST (PA AND LAT),PA,224,224,0,0,CHEST (PA AND LAT),postero-anterior,Erect +8cf7e7b2-46bdc39f-31564b20-71ee6b6d,4eedf9fe,f8521632,CHEST (PA AND LAT),PA,224,224,0,0,CHEST (PA AND LAT),postero-anterior,Erect +6a301286-456d8528-c59bd118-195ce98c,bc0ee611,89cfed09,CHEST (PA AND LAT),LATERAL,224,224,0,0,CHEST (PA AND LAT),lateral,Erect +67dc7d13-32da76c2-ff22a175-5d4c3ced,54e08d2a,1c41417d,CHEST (PA AND LAT),PA,224,224,0,0,CHEST (PA AND LAT),postero-anterior,Erect +b69fd4aa-55745241-f15af2bb-b979004a,3b3b7d36,f076c36f,CHEST (PA AND LAT),PA,224,224,0,0,CHEST (PA AND LAT),postero-anterior,Erect +7e393cbe-6eac27c9-469708b4-dc5f22ef,310345ea,274bcf57,CHEST (PA AND LAT),LATERAL,224,224,0,0,CHEST (PA AND LAT),lateral,Erect +a0082cb3-93689ac6-4fbbad4e-ddd68866,bfb2d1b6,2c5d98ee,CHEST (PA AND LAT),LATERAL,224,224,0,0,CHEST (PA AND LAT),lateral,Recumbent +4197fbbc-1ed2f09e-8a82308b-971009a3,3d404d22,1e752e16,CHEST (PA AND LAT),LATERAL,224,224,0,0,CHEST (PA AND LAT),lateral,Erect +baf27490-fe6678a0-fe0b7379-665d78ba,6145fd64,7968e508,CHEST (PA AND LAT),PA,224,224,0,0,CHEST (PA AND 
LAT),postero-anterior,Recumbent diff --git a/tests/gen_mimic/mimic-cxr-2.0.0-metadata.csv.gz b/tests/gen_mimic/mimic-cxr-2.0.0-metadata.csv.gz new file mode 100644 index 0000000..f18c4b5 Binary files /dev/null and b/tests/gen_mimic/mimic-cxr-2.0.0-metadata.csv.gz differ diff --git a/tests/gen_mimic/mimic-cxr-2.0.0-negbio.csv b/tests/gen_mimic/mimic-cxr-2.0.0-negbio.csv new file mode 100644 index 0000000..e515991 --- /dev/null +++ b/tests/gen_mimic/mimic-cxr-2.0.0-negbio.csv @@ -0,0 +1,11 @@ +subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices +346b3f01,14b48a44,-1.0,-1.0,0.0,0.0,-1.0,-1.0,,-1.0,0.0,,,,, +11a8f57f,bf2ecf26,1.0,1.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0,1.0,1.0,-1.0,,1.0 +4eedf9fe,f8521632,,,1.0,0.0,1.0,1.0,,,,1.0,0.0,-1.0,-1.0, +bc0ee611,89cfed09,0.0,,0.0,-1.0,-1.0,1.0,1.0,1.0,,0.0,-1.0,0.0,-1.0,0.0 +54e08d2a,1c41417d,1.0,-1.0,0.0,,,-1.0,-1.0,1.0,-1.0,-1.0,,,1.0,0.0 +3b3b7d36,f076c36f,,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,-1.0,,,1.0,0.0 +310345ea,274bcf57,,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.0 +bfb2d1b6,2c5d98ee,-1.0,,,,,,1.0,,1.0,0.0,0.0,0.0,-1.0,-1.0 +3d404d22,1e752e16,0.0,0.0,1.0,-1.0,0.0,-1.0,1.0,1.0,0.0,-1.0,-1.0,1.0,-1.0,1.0 +6145fd64,7968e508,,-1.0,0.0,,-1.0,1.0,1.0,0.0,1.0,1.0,,1.0,, diff --git a/tests/gen_mimic/mimic-cxr-2.0.0-negbio.csv.gz b/tests/gen_mimic/mimic-cxr-2.0.0-negbio.csv.gz new file mode 100644 index 0000000..8c390f9 Binary files /dev/null and b/tests/gen_mimic/mimic-cxr-2.0.0-negbio.csv.gz differ diff --git a/tests/gen_rsna.py b/tests/gen_rsna.py new file mode 100644 index 0000000..7afad86 --- /dev/null +++ b/tests/gen_rsna.py @@ -0,0 +1,36 @@ +from argparse import ArgumentParser +from random_data import write_random_images +import pandas as pd + +rsna_imgid_column = "patientId" + +extension = ".jpg" + +def gen_rsna(test_csv, train_csv, test_data_folder, dimensions): + 
train_data = pd.read_csv(train_csv) + test_data = pd.read_csv(test_csv) + write_random_images( + pd.concat([ + train_data[rsna_imgid_column].map(lambda path: os.path.join(train_folder, path)), + test_data[rsna_imgid_column].map(lambda path: os.path.join(train_folder, path)) + ]) + extension, + test_data_folder/"folder", + test_data_folder/"tar.tar", + test_data_folder/"zip.zip", + dimensions + ) + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("train") + parser.add_argument("test") + parser.add_argument("test_data_folder") + parser.add_argument("x") + parser.add_argument("y") + args = parser.parse_args() + gen_rsna( + args.test, + args.train, + args.test_data_folder, + (int(args.x), int(args.y)) + ) diff --git a/tests/generate_all.sh b/tests/generate_all.sh new file mode 100644 index 0000000..001d6f1 --- /dev/null +++ b/tests/generate_all.sh @@ -0,0 +1,3 @@ +python gen_mimic.py +python gen_chexpert.py +python3 generate_test_data.py diff --git a/tests/generate_test_data.py b/tests/generate_test_data.py new file mode 100644 index 0000000..b70649c --- /dev/null +++ b/tests/generate_test_data.py @@ -0,0 +1,36 @@ +from pathlib import Path +import pandas as pd +from random_data import write_random_images +import argparse + +def generate_test_data(metadata_file, filename_column, size, test_data_folder, filename_suffix, subfolder=None): + test_data_folder = Path(test_data_folder) + write_random_images( + metadata_file[filename_column] + filename_suffix, + test_data_folder/"folder", + test_data_folder/"tar.tar", + test_data_folder/"zip.zip", + test_data_folder/"zipped", + test_data_folder/"tgz", + size, + subfolder=Path(subfolder) + ) + +#if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# parser.add_argument("metadata_file") +# parser.add_argument("filename_column") +# parser.add_argument("x") +# parser.add_argument("y") +# parser.add_argument("test_data_folder") +# parser.add_argument("suffix", nargs="*", default="") +# 
parser.add_argument("--subfolder", dest="subfolder", nargs="?", default=".") +# args = parser.parse_args() +# generate_test_data( +# pd.read_csv(args.metadata_file), +# args.filename_column, +# (int(args.x), int(args.y)), +# args.test_data_folder, +# args.suffix, +# args.subfolder +# ) diff --git a/tests/generate_tests.py b/tests/generate_tests.py new file mode 100644 index 0000000..56b1724 --- /dev/null +++ b/tests/generate_tests.py @@ -0,0 +1,32 @@ +import torchxrayvision as xrv + +with open("test_indices") as handle: + indices = [int(line) for line in handle] + +create_tests_for = [ + (xrv.datasets.CheX_Dataset, + {"imgpath":"/network/tmp1/paul.morrison/network/CheXpert-v1.0-small.zip", + "csvpath":} + ), + (xrv.datasets.MIMIC_Dataset, + {} + ), + (xrv.datasets.NIH_Dataset, + ), + (xrv.datasets.NIH_Google_Dataset, + ), + (xrv.datasets.NLMTB_Dataset, + ), + (xrv.datasets.Openi_Dataset, + ), + (xrv.datasets.PC_Dataset, + ), + (xrv.datasets.RSNA_Pneumonia_Dataset, + ), + (xrv.datasets.COVID19_Dataset, + ) +] + +for dataset in create_tests_for: + print(dataset.__name__) + print(dataset().csv[:10]) diff --git a/tests/montgomery.csv b/tests/montgomery.csv new file mode 100644 index 0000000..0ceab0b --- /dev/null +++ b/tests/montgomery.csv @@ -0,0 +1,11 @@ +fname,label +CXR_png/MCUCXR_0001_0.png,0 +CXR_png/MCUCXR_0002_0.png,0 +CXR_png/MCUCXR_0003_0.png,0 +CXR_png/MCUCXR_0004_0.png,0 +CXR_png/MCUCXR_0005_0.png,0 +CXR_png/MCUCXR_0006_0.png,0 +CXR_png/MCUCXR_0008_0.png,0 +CXR_png/MCUCXR_0011_0.png,0 +CXR_png/MCUCXR_0013_0.png,0 +CXR_png/MCUCXR_0015_0.png,0 diff --git a/tests/nih.csv b/tests/nih.csv new file mode 100644 index 0000000..7f5410a --- /dev/null +++ b/tests/nih.csv @@ -0,0 +1,11 @@ +,Patient ID,Image Index,Finding Labels,Follow-up #,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],has_masks +0,1,00000001_000.png,Cardiomegaly,0,57,M,PA,2682,2749,0.14300000000000002,0.14300000000000002,False 
+1,2,00000002_000.png,No Finding,0,80,M,PA,2500,2048,0.171,0.171,False +2,3,00000003_001.png,Hernia,0,74,F,PA,2500,2048,0.168,0.168,False +3,5,00000005_000.png,No Finding,0,69,F,PA,2048,2500,0.168,0.168,False +4,6,00000006_000.png,No Finding,0,81,M,PA,2500,2048,0.168,0.168,False +5,7,00000007_000.png,No Finding,0,82,M,PA,2500,2048,0.168,0.168,False +6,8,00000008_000.png,Cardiomegaly,0,68,F,PA,2048,2500,0.171,0.171,False +7,9,00000009_000.png,Emphysema,0,72,M,PA,2992,2991,0.14300000000000002,0.14300000000000002,False +8,10,00000010_000.png,Infiltration,0,84,F,PA,2992,2991,0.14300000000000002,0.14300000000000002,False +9,11,00000011_000.png,Effusion,0,74,M,PA,2638,2449,0.14300000000000002,0.14300000000000002,False diff --git a/tests/pc.csv b/tests/pc.csv new file mode 100644 index 0000000..ab49556 --- /dev/null +++ b/tests/pc.csv @@ -0,0 +1,11 @@ +,PatientID,Unnamed: 0,ImageID,ImageDir,StudyDate_DICOM,StudyID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,Pediatric,Modality_DICOM,Manufacturer_DICOM,PhotometricInterpretation_DICOM,PixelRepresentation_DICOM,PixelAspectRatio_DICOM,SpatialResolution_DICOM,BitsStored_DICOM,WindowCenter_DICOM,WindowWidth_DICOM,Rows_DICOM,Columns_DICOM,XRayTubeCurrent_DICOM,Exposure_DICOM,ExposureInuAs_DICOM,ExposureTime,RelativeXRayExposure_DICOM,ReportID,Report,MethodLabel,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS +0,100002652742762245719447501824197948584,1691,125374151943505747025890313053997514922_j5rk5q.png,0,20170124,125374151943505747025890313053997514922,1953.0,F,PA,PA,Manual review of DICOM fields,No,DX,PhilipsMedicalSystems,MONOCHROME2,0,None,0.148,12,2047.5,4095,2279,2778,None,1,1200,5,369,5305389, escoliosis . 
sin hallazg pulmonar,Physician,['scoliosis'],[],"[['scoliosis'], ['normal'], ['normal']]",['C0036439'],[] +1,100008301845755693600822040133468866003,28441,117677712752528732526839762067921423608_ag14ef.png,10,20171023,117677712752528732526839762067921423608,1963.0,F,PA,PA,Manual review of DICOM fields,No,DX,PhilipsMedicalSystems,MONOCHROME2,0,None,0.148,12,2047.5,4095,2791,2628,None,1,900,3,379,5499322, sign atrap aere . sin hallazg radiolog signific .,Physician,"['COPD signs', 'air trapping']",[],"[['COPD signs', 'air trapping'], ['normal']]",['C0024117' 'C0231819'],[] +2,100013367876124980580350390072254146837,25017,336326204749794028912525116839151974960_vsaz23.png,8,20150211,336326204749794028912525116839151974960,1969.0,M,,PA,Manual review of DICOM fields,No,CR,PhilipsMedicalSystems,MONOCHROME2,0,"['1', '1']",0.2000000029802,12,2047.0,4095.0,1760,2140,None,None,None,None,210,4851188, estudi con escas grad inspiratori . atelectasi infiltr lamin lingul . liger pinzamient sen costofren derech . 
valor con clinic proces infecci neumon .,Physician,"['laminar atelectasis', 'pneumonia', 'suboptimal study', 'costophrenic angle blunting', 'infiltrates']","['loc lingula', 'loc costophrenic angle', 'loc right costophrenic angle']","[['laminar atelectasis', 'infiltrates', 'loc lingula'], ['pneumonia'], ['suboptimal study'], ['costophrenic angle blunting', 'loc right costophrenic angle', 'loc costophrenic angle']]",['C0032285' 'C2828075' 'C0742855' 'C0277877'],['C0225740' 'C0230151' 'C0504099'] +3,100026982716795250809914871389223660538,89912,216840111366964013534861372972012353112932019_01-129-187.png,27,20121218,216840111366964013534861372972012353112932019,1973.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,No,CR,ImagingDynamicsCompanyLtd,MONOCHROME2,0,None,None,12,2177,2100,1764,1948,320,3,3200,10,-1.11,4384412, no apreci infiltr ni condens pulmonar .,RNN_model,['normal'],[],['normal'],[],[] +4,100028509573539327074611268168549994987,44189,216840111366964013663026955732013130130708697_02-042-053.png,13,20130522,216840111366964013663026955732013130130708697,1970.0,M,,PA,resnet-50.t7,No,CR,PhilipsMedicalSystems,MONOCHROME2,0,None,None,10,5.115000e+02,1.023000e+03,3520,4280,None,None,None,None,None,4469779, sin hallazg patolog signific,RNN_model,['normal'],[],['normal'],[],[] +5,100036099730704895710487466181648561260,29774,300276821619038978339766780609910437168_fgj94z.png,10,20170310,300276821619038978339766780609910437168,1941.0,M,,PA,Manual review of DICOM fields,No,CR,PhilipsMedicalSystems,MONOCHROME2,0,"['1', '1']",0.2000000029802,12,2047.0,4095.0,1760,2140,None,None,None,None,1144,5338259, sin hallazg signific . 
escoliosis dorsal,Physician,['scoliosis'],[],"[['scoliosis'], ['normal'], ['normal']]",['C0036439'],[] +6,100036618866701987672955579216452480417,133144,216840111366964013076187734852011294130558067_00-193-087.png,41,20111024,216840111366964013076187734852011294130558067,1934.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,No,CR,ImagingDynamicsCompanyLtd,MONOCHROME2,0,None,None,12,2195,2281,1664,1776,320,3,3200,10,-0.57,4141680, minim compresion par lateral derech traque puest relacion con pequen boci . cambi mecan column .,RNN_model,"['goiter', 'vertebral degenerative changes']","['loc tracheal', 'loc right']","['goiter', 'loc tracheal', 'loc right', 'vertebral degenerative changes']",['C0018021' 'C4290224'],['C0040578' 'C0444532'] +7,10005572179957571894246722745910504559,35656,318099584815157573909341904878106290504_erj1pn.png,54,20150423,318099584815157573909341904878106290504,1937.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,No,CR,ImagingDynamicsCompanyLtd,MONOCHROME2,0,None,None,12,1975,2116,3500,3620,400,2,2000,5,None,4902417, imagen compat con cambi inflamatori caract cronic bas derech . aort elong con calcificacion cay .,Physician,"['aortic button enlargement', 'chronic changes', 'aortic atheromatosis']","['loc aortic', 'loc right', 'loc basal']","[['aortic button enlargement', 'aortic atheromatosis', 'loc aortic'], ['chronic changes', 'loc basal', 'loc right']]",['C1851119' 'C0742362' 'C1096249'],['C0003483' 'C0444532' 'C1282378'] +8,100058298117820286456857046001529978359,73720,216840111366964013199786354762011304090209759_01-001-141.png,22,20111031,216840111366964013199786354762011304090209759,1941.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,No,CR,PhilipsMedicalSystems,MONOCHROME2,0,"['1', '1']",0.2,12,2047.0,4095.0,1760,2140,None,None,None,None,330,4146530, elevacion hemidiafragm izquierd con pinzamient sen costofren probabl caract residual . 
parenquim sin alter .,RNN_model,"['costophrenic angle blunting', ' hemidiaphragm elevation']","['loc diaphragm', 'loc costophrenic angle', 'loc left']","['costophrenic angle blunting', ' hemidiaphragm elevation', 'loc diaphragm', 'loc costophrenic angle', 'loc left', 'normal']",['C0742855' 'C2073707'],['C0011980' 'C0230151' 'C0443246'] +9,100059873297205262788714053371897021487,15633,12986792586879524468475227325378905558_f8k8bq.png,5,20160901,12986792586879524468475227325378905558,1937.0,M,,PA,Manual review of DICOM fields,No,CR,PhilipsMedicalSystems,MONOCHROME2,0,"['1', '1']",0.143,12,2047.0,4095.0,2743,3000,0,0,None,0,429,5217128, no apreci infiltr consolid . cambi cronic parenquim pulmon con sign atrap aere . engros pleural biapical predomini derech . pinzamient sen costofren izquierd . ateromatosis aortic calcific .,Physician,"['aortic atheromatosis', 'chronic changes', 'costophrenic angle blunting', 'apical pleural thickening', 'air trapping']","['loc aortic', 'loc pleural', 'loc right', 'loc costophrenic angle', 'loc left costophrenic angle']","[['aortic atheromatosis', 'loc aortic'], ['chronic changes', 'air trapping'], ['normal'], ['costophrenic angle blunting', 'loc left costophrenic angle', 'loc costophrenic angle'], ['apical pleural thickening', 'loc pleural', 'loc right'], ['apical pleural thickening', 'loc pleural', 'loc right']]",['C1096249' 'C0742362' 'C0742855' 'C0231819'],['C0003483' 'C0032225' 'C0444532' 'C0230151' 'C0504100'] diff --git a/tests/random_data.py b/tests/random_data.py new file mode 100644 index 0000000..9c644af --- /dev/null +++ b/tests/random_data.py @@ -0,0 +1,253 @@ +import tarfile +import datetime +import numpy as np +import zipfile +import os +import shutil +import random +from PIL import Image +import glob +import pdb +import pydicom +import io +from pydicom.dataset import FileMetaDataset, FileDataset +from pydicom.encaps import encapsulate +import pydicom +import tempfile + +import io +from PIL import Image, ImageDraw +from 
def np_to_dcm(image, filename):
    """Write ``image`` to ``filename`` as a minimal Secondary Capture DICOM.

    Args:
        image: Anything ``np.array`` can turn into a 3-D array laid out as
            (rows, columns, channels), such as the RGB PIL image returned by
            ``generate_random_image``.
        filename: Destination path, passed straight to ``Dataset.save_as``.
    """
    image = np.array(image)
    # NumPy image arrays are indexed (row, column, channel): axis 0 is the
    # height and axis 1 the width.  The height used to be read from axis 2,
    # i.e. the channel count, so every file claimed to be 3 rows tall.
    HEIGHT = image.shape[0]
    WIDTH = image.shape[1]

    ds = Dataset()
    ds.is_little_endian = True
    ds.is_implicit_VR = True
    ds.SOPClassUID = SecondaryCaptureImageStorage
    ds.SOPInstanceUID = generate_uid()
    ds.fix_meta_info()

    ds.Modality = "OT"
    ds.SamplesPerPixel = 3
    ds.BitsAllocated = 8
    ds.BitsStored = 8
    ds.HighBit = 7
    ds.PixelRepresentation = 0
    ds.PlanarConfiguration = 1
    ds.PhotometricInterpretation = "RGB"
    ds.Rows = HEIGHT
    ds.Columns = WIDTH

    # NOTE(review): the single frame below is the raw array buffer, not a
    # JPEG stream, although the transfer syntax declares JPEG Extended and
    # the photometric interpretation is switched to YBR_FULL_422 -- confirm
    # that whatever reads these fixtures tolerates that combination.
    ds.PixelData = encapsulate([image.tobytes()])
    ds["PixelData"].is_undefined_length = True
    ds.PhotometricInterpretation = "YBR_FULL_422"
    ds.file_meta.TransferSyntaxUID = JPEGExtended

    ds.save_as(filename, write_like_original=False)
class FolderOfArchive:
    """Spread written files over a tree of fixed-size archives.

    Every ``archive_size`` calls to :meth:`write` a new archive is opened at
    ``root/<folder_format> * depth/<archive_format>``; the ``{}`` in both
    formats is filled with the archive counter.  Subclasses provide
    ``archive_format``, :meth:`get_new_archive` and :meth:`add_to_archive`.

    Instances can be used as context managers so that all archives are
    closed even when writing fails.

    NOTE(review): the counter is incremented only after the path has been
    built, so the first archive is numbered -1 (e.g. ``folder-1/zip-1.zip``).
    This looks unintended but is kept so existing fixture names stay stable.
    """

    folder_format = "folder{}"
    depth = 1

    def __init__(self, root, depth, archive_size=3):
        self.root = root
        self.depth = depth
        self.archive_size = archive_size
        self.current_archive = -1
        # Start in the "archive full" state so the first write opens one.
        self.archive_position = archive_size - 1
        self.archives = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_new_archive(self, new_path):
        """Open and return a writable archive at ``new_path`` (subclass hook)."""
        raise NotImplementedError("subclasses must implement get_new_archive")

    def add_to_archive(self, archive, content):
        """Add the file at path ``content`` to ``archive`` (subclass hook)."""
        raise NotImplementedError("subclasses must implement add_to_archive")

    def get_path_from_root(self, n):
        """Return the path, relative to ``root``, of archive number ``n``."""
        parts = [self.folder_format.format(n)] * self.depth
        parts.append(self.archive_format.format(n))
        return os.path.join(*parts)

    def get_current_archive(self):
        """Return the archive the next file belongs in, opening one if needed."""
        self.archive_position += 1
        if self.archive_position == self.archive_size:
            new_path = os.path.join(
                self.root,
                self.get_path_from_root(self.current_archive)
            )
            parent = os.path.dirname(new_path)
            # An empty root with depth 0 has no parent directory to create;
            # os.makedirs("") would raise FileNotFoundError.
            if parent:
                os.makedirs(parent, exist_ok=True)
            self.archives.append(self.get_new_archive(new_path))
            self.archive_position = 0
            self.current_archive += 1
        return self.archives[-1]

    def close(self):
        """Close every archive, then re-raise the first failure, if any."""
        first_error = None
        for archive in self.archives:
            try:
                archive.close()
            except Exception as err:
                # Keep going so one bad archive cannot leak the others.
                if first_error is None:
                    first_error = err
        if first_error is not None:
            raise first_error

    def write(self, content):
        """Add the file at path ``content`` to the current archive."""
        self.add_to_archive(self.get_current_archive(), content)


class FolderOfTar(FolderOfArchive):
    """Folder tree of uncompressed ``.tar`` archives."""

    archive_format = "tar{}.tar"

    def add_to_archive(self, archive, content):
        archive.add(content)

    def get_new_archive(self, new_path):
        return tarfile.open(new_path, "w")


class FolderOfTarGz(FolderOfArchive):
    """Folder tree of gzip-compressed ``.tar.gz`` archives."""

    archive_format = "tar{}.tar.gz"

    def add_to_archive(self, archive, content):
        archive.add(content)

    def get_new_archive(self, new_path):
        return tarfile.open(new_path, "w:gz")


class FolderOfZip(FolderOfArchive):
    """Folder tree of ``.zip`` archives."""

    archive_format = "zip{}.zip"

    def add_to_archive(self, archive, content):
        archive.write(content)

    def get_new_archive(self, new_path):
        return zipfile.ZipFile(new_path, "w")
def gen_int(n):
    """Return a random string of ``n`` decimal digits.

    Digits are drawn independently with ``np.random.choice``, so the result
    may start with zeros.  ``gen_hex`` is the hexadecimal counterpart; this
    function previously sampled from the same hex alphabet and could return
    letters ``a``-``f``.
    """
    int_chars = list("0123456789")
    return "".join(np.random.choice(int_chars, n))
+CXR_png/CHNCXR_0007_0.png,0 +CXR_png/CHNCXR_0008_0.png,0 +CXR_png/CHNCXR_0009_0.png,0 +CXR_png/CHNCXR_0010_0.png,0 diff --git a/tests/test_covid_data.csv b/tests/test_covid_data.csv new file mode 100644 index 0000000..a5c6187 --- /dev/null +++ b/tests/test_covid_data.csv @@ -0,0 +1,11 @@ +patientid,offset,sex,age,finding,RT_PCR_positive,survival,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,extubated,temperature,pO2_saturation,leukocyte_count,neutrophil_count,lymphocyte_count,view,modality,date,location,folder,filename,doi,url,license,clinical_notes,other_notes +2,0,M,65,COVID-19,Y,Y,N,N,N,N,Y,,,,,,,PA,X-ray,"January 22, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,"On January 22, 2020, a 65-year-old man with a history of hypertension, type 2 diabetes, coronary heart disease for which a stent had been implanted, and lung cancer was admitted to the emergency department of Cho Ray Hospital, the referral hospital in Ho Chi Minh City, for low-grade fever and fatigue. He had become ill with fever on January 17, a total of 4 days after he and his wife had flown to Hanoi from the Wuchang district in Wuhan, where outbreaks of 2019-nCoV were occurring. He reported that he had not been exposed to a “wet market” (a market where dead and live animals are sold) in Wuhan. 
Chest radiographs obtained on admission showed an infiltrate in the upper lobe of the left lung", +2,3,M,65,COVID-19,Y,Y,N,N,N,N,Y,,,,,,,PA,X-ray,"January 25, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,"On January 22, 2020, a 65-year-old man with a history of hypertension, type 2 diabetes, coronary heart disease for which a stent had been implanted, and lung cancer was admitted to the emergency department of Cho Ray Hospital, the referral hospital in Ho Chi Minh City, for low-grade fever and fatigue. He had become ill with fever on January 17, a total of 4 days after he and his wife had flown to Hanoi from the Wuchang district in Wuhan, where outbreaks of 2019-nCoV were occurring. He reported that he had not been exposed to a “wet market” (a market where dead and live animals are sold) in Wuhan. On January 25, he received supplemental oxygen through a nasal cannula at a rate of 5 liters per minute because of increasing dyspnea with hypoxemia. The partial pressure of oxygen was 57.2 mm Hg while he was breathing ambient air, and a progressive infiltrate and consolidation were observed on chest radiographs", +2,5,M,65,COVID-19,Y,Y,N,N,N,N,Y,,,,,,,PA,X-ray,"January 27, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,"On January 22, 2020, a 65-year-old man with a history of hypertension, type 2 diabetes, coronary heart disease for which a stent had been implanted, and lung cancer was admitted to the emergency department of Cho Ray Hospital, the referral hospital in Ho Chi Minh City, for low-grade fever and fatigue. 
He had become ill with fever on January 17, a total of 4 days after he and his wife had flown to Hanoi from the Wuchang district in Wuhan, where outbreaks of 2019-nCoV were occurring. He reported that he had not been exposed to a “wet market” (a market where dead and live animals are sold) in Wuhan. On January 25, he received supplemental oxygen through a nasal cannula at a rate of 5 liters per minute because of increasing dyspnea with hypoxemia. The partial pressure of oxygen was 57.2 mm Hg while he was breathing ambient air, and a progressive infiltrate and consolidation were observed on chest radiographs", +2,6,M,65,COVID-19,Y,Y,N,N,N,N,Y,,,,,,,PA,X-ray,"January 28, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,"On January 22, 2020, a 65-year-old man with a history of hypertension, type 2 diabetes, coronary heart disease for which a stent had been implanted, and lung cancer was admitted to the emergency department of Cho Ray Hospital, the referral hospital in Ho Chi Minh City, for low-grade fever and fatigue. He had become ill with fever on January 17, a total of 4 days after he and his wife had flown to Hanoi from the Wuchang district in Wuhan, where outbreaks of 2019-nCoV were occurring. He reported that he had not been exposed to a “wet market” (a market where dead and live animals are sold) in Wuhan. 
Progressive infiltrate and consolidation", +4,0,F,52,COVID-19,Y,,N,N,N,N,N,,,,,,,PA,X-ray,"January 25, 2020","Changhua Christian Hospital, Changhua City, Taiwan ",images,nejmc2001573_f1a.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc2001573,,diffuse infiltrates in the bilateral lower lungs, +4,5,F,52,COVID-19,Y,,N,N,N,N,N,,,,,,,PA,X-ray,"January 30, 2020","Changhua Christian Hospital, Changhua City, Taiwan ",images,nejmc2001573_f1b.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc2001573,,progressive diffuse interstitial opacities and consolidation in the bilateral parahilar areas and lower lung fields, +5,,,,ARDS,,,Y,Y,Y,Y,,,,,,,,PA,X-ray,2017,,images,ARDSSevere.png,,https://en.wikipedia.org/wiki/File:ARDSSevere.png,CC BY-SA,Severe ARDS. Person is intubated with an OG in place., +6,0,,,COVID-19,Y,,Y,Y,Y,Y,,,,,,,,PA,X-ray,"January 6, 2020","Wuhan Jinyintan Hospital, Wuhan, Hubei Province, China",images,lancet-case2a.jpg,10.1016/S0140-6736(20)30211-7,https://www.thelancet.com/journals/lancet/article/PIIS0140-6736%2820%2930211-7/fulltext,,"Case 2: chest x-ray obtained on Jan 6 (2A). The brightness of both lungs was decreased and multiple patchy shadows were observed; edges were blurred, and large ground-glass opacity and condensation shadows were mainly on the lower right lobe. Tracheal intubation could be seen in the trachea. Heart shadow roughly presents in the normal range. On the left side, the diaphragmatic surface is not clearly displayed. The right side of the diaphragmatic surface was light and smooth and rib phrenic angle was less sharp. Chest x-ray on Jan 10 showed worse status (2B)", +6,4,,,COVID-19,Y,,Y,Y,Y,Y,,,,,,,,PA,X-ray,"January 10, 2020","Wuhan Jinyintan Hospital, Wuhan, Hubei Province, China",images,lancet-case2b.jpg,10.1016/S0140-6736(20)30211-7,https://www.thelancet.com/journals/lancet/article/PIIS0140-6736%2820%2930211-7/fulltext,,"Case 2: chest x-ray obtained on Jan 6 (2A). 
The brightness of both lungs was decreased and multiple patchy shadows were observed; edges were blurred, and large ground-glass opacity and condensation shadows were mainly on the lower right lobe. Tracheal intubation could be seen in the trachea. Heart shadow roughly presents in the normal range. On the left side, the diaphragmatic surface is not clearly displayed. The right side of the diaphragmatic surface was light and smooth and rib phrenic angle was less sharp. Chest x-ray on Jan 10 showed worse status (2B)", +3,4,M,74,SARS,,N,,,,,,,38,,,,,AP,X-ray,2004,"Mount Sinai Hospital, Toronto, Ontario, Canada",images,SARS-10.1148rg.242035193-g04mr34g0-Fig8a-day0.jpeg,10.1148/rg.242035193,https://pubs.rsna.org/doi/10.1148/rg.242035193,,SARS in a 74-year-old man who developed symptoms 4 days after exposure. Initial anteroposterior chest radiograph shows bilateral airspace disease that is more extensive in the left lung., diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 74f8df4..d21622b 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -1,19 +1,28 @@ import pytest +import pickle import torchxrayvision as xrv - +import os +import torch +from hashlib import blake2b +import time +from pathlib import Path +import pdb + dataset_classes = [xrv.datasets.NIH_Dataset, - xrv.datasets.PC_Dataset, + xrv.datasets.Openi_Dataset, xrv.datasets.NIH_Google_Dataset, - xrv.datasets.Openi_Dataset] + xrv.datasets.PC_Dataset] def test_dataloader_basic(): for dataset_class in dataset_classes: - dataset_class(imgpath=".") + c = dataset_class(imgpath=".") + c.image_interface.close() def test_dataloader_merging(): datasets = [] for dataset_class in dataset_classes: + print(dataset_class) dataset = dataset_class(imgpath=".") datasets.append(dataset) @@ -22,6 +31,9 @@ def test_dataloader_merging(): dd = xrv.datasets.Merge_Dataset(datasets) + for dataset in datasets: + dataset.image_interface.close() + # test that we catch incorrect pathology alignment 
def all_equal(items):
    """Return True when every element of ``items`` equals the first one.

    Empty and single-element sequences count as all-equal.
    """
    return all(item == items[0] for item in items[1:])


def _test_opening_formats(dataset_class, imgpaths, n=10, **kwargs):
    """Check that a dataset yields identical samples from every storage format.

    Args:
        dataset_class: Dataset constructor accepting ``imgpath`` plus
            ``kwargs``; instances expose ``csv``, ``labels`` and
            ``image_interface``.
        imgpaths: Locations of the same images stored in different formats.
        n: Number of leading samples to compare across formats, and the
            number of rows kept for the ``DataLoader`` pass.
        **kwargs: Forwarded unchanged to ``dataset_class``.
    """
    sources = []
    try:
        for imgpath in imgpaths:
            sources.append(dataset_class(imgpath=imgpath, **kwargs))

        # Serial pass: the i-th sample must be byte-identical in every format.
        # The loop used to break at ``n - 1`` before asserting, which left the
        # n-th sample unchecked.
        for i, one_item_from_each in enumerate(zip(*sources)):
            if i >= n:
                break
            assert all_equal([pickle.dumps(item) for item in one_item_from_each])

        # Parallel pass: every format must also load through worker processes.
        for source in sources:
            source.csv = source.csv.iloc[:n]
            source.labels = source.labels[:n]
            loader = torch.utils.data.DataLoader(
                source,
                batch_size=10,
                shuffle=False,
                num_workers=8,
                pin_memory=False
            )
            for _ in loader:
                pass
    finally:
        # Release archive handles even when an assertion or a load fails.
        for source in sources:
            source.image_interface.close()
+ csvpath="tests/nih.csv" + ) + +def test_pc_formats(): + _test_opening_formats( + xrv.datasets.PC_Dataset, + imgpaths=[ + "tests/PC_test_data/folder", + "tests/PC_test_data/tar.tar", + "tests/PC_test_data/zip.zip", + "tests/PC_test_data/zipped_1", + "tests/PC_test_data/zipped_2", + "tests/PC_test_data/tgz_1", + "tests/PC_test_data/tgz_2" + ], + csvpath="tests/pc.csv" + ) + +def test_shenzen_formats(): + _test_opening_formats( + xrv.datasets.NLMTB_Dataset, + imgpaths=[ + "tests/Shenzen_test_data/folder", + "tests/Shenzen_test_data/tar.tar", + "tests/Shenzen_test_data/zip.zip", + "tests/Shenzen_test_data/zipped_1", + "tests/Shenzen_test_data/zipped_2", + "tests/Shenzen_test_data/tgz_1", + "tests/Shenzen_test_data/tgz_2", + ] + ) + +def test_montgomery_formats(): + _test_opening_formats( + xrv.datasets.NLMTB_Dataset, + imgpaths=[ + "tests/Montgomery_test_data/folder", + "tests/Montgomery_test_data/tar.tar", + "tests/Montgomery_test_data/zip.zip", + "tests/Montgomery_test_data/zipped_1", + "tests/Montgomery_test_data/zipped_2", + "tests/Montgomery_test_data/tgz_1", + "tests/Montgomery_test_data/tgz_2" + ] + ) + +def test_rsna_jpg_formats(): + _test_opening_formats( + xrv.datasets.RSNA_Pneumonia_Dataset, + imgpaths=[ + "tests/RSNA_test_data_jpg/folder", + "tests/RSNA_test_data_jpg/tar.tar", + "tests/RSNA_test_data_jpg/zip.zip", + "tests/RSNA_test_data_jpg/zipped_1", + "tests/RSNA_test_data_jpg/zipped_2", + "tests/RSNA_test_data_jpg/tgz_1", + "tests/RSNA_test_data_jpg/tgz_2" + ], + csvpath="tests/rsna_train.csv" + ) + +def test_rsna_dcm_formats(): + _test_opening_formats( + xrv.datasets.RSNA_Pneumonia_Dataset, + imgpaths=[ + "tests/RSNA_test_data_dcm/folder", + "tests/RSNA_test_data_dcm/tar.tar", + "tests/RSNA_test_data_dcm/zip.zip", + "tests/RSNA_test_data_dcm/zipped_1", + "tests/RSNA_test_data_dcm/zipped_2", + "tests/RSNA_test_data_dcm/tgz_1", + "tests/RSNA_test_data_dcm/tgz_2" + ], + csvpath="tests/rsna_train.csv", + extension=".dcm" + ) + +def test_chex_formats(): + 
def delete_cache_entry(imgpath, path_length, cache_dir=None):
    """Delete the cached filename mapping for ``imgpath``.

    The cache file is named after the BLAKE2b hex digest of the pickled
    ``(absolute path, modification time, path_length)`` key, with a ``.pkl``
    suffix.

    Args:
        imgpath: Image folder or archive whose cache entry should be removed.
        path_length: The dataset's ``path_length``, part of the cache key.
        cache_dir: Directory holding the cache files.  Defaults to
            ``~/.torchxrayvision/filename-mapping-cache``.

    Raises:
        AssertionError: If no cache entry exists for ``imgpath``.
    """
    timestamp = os.path.getmtime(imgpath)
    key = (os.path.abspath(imgpath), timestamp, path_length)
    filename = blake2b(pickle.dumps(key)).hexdigest() + ".pkl"
    if cache_dir is None:
        cache_dir = os.path.expanduser(
            os.path.join("~", ".torchxrayvision", "filename-mapping-cache")
        )
    delete_path = os.path.join(cache_dir, filename)
    assert os.path.exists(delete_path), "no cache entry at " + delete_path
    os.remove(delete_path)
    assert not os.path.exists(delete_path)


def _test_second_load_faster(dataset, imgpath, *args, **kwargs):
    """Assert that building ``dataset`` a second time is faster than the first.

    The cache entry for ``imgpath`` is deleted first, so the first
    construction starts without it.

    Args:
        dataset: Dataset class exposing ``path_length``; it is called as
            ``dataset(*args, imgpath=imgpath, **kwargs)``.
        imgpath: Image folder or archive handed to the dataset.
    """
    delete_cache_entry(imgpath, dataset.path_length)

    opened = []
    try:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        slow_start = time.perf_counter()
        opened.append(dataset(*args, imgpath=imgpath, **kwargs))
        slow_time = time.perf_counter() - slow_start

        fast_start = time.perf_counter()
        opened.append(dataset(*args, imgpath=imgpath, **kwargs))
        fast_time = time.perf_counter() - fast_start
    finally:
        # Both instances stay open until timing is done, then are released.
        for instance in opened:
            instance.image_interface.close()

    print(round(slow_time, 2), round(fast_time, 2))
    assert slow_time > fast_time
self.datasets = datasets self.length = 0 @@ -128,6 +126,8 @@ def __init__(self, datasets, seed=0, label_concat=False): self.csv = pd.concat([d.csv for d in datasets]) except: print("Could not merge dataframes (.csv not available):", sys.exc_info()[0]) + + super(Merge_Dataset, self).__init__() def __repr__(self): @@ -195,8 +195,8 @@ def __len__(self): def __getitem__(self, idx): return self.dataset[self.idxs[idx]] - class NIH_Dataset(Dataset): + path_length = 1 """ NIH ChestX-ray8 dataset @@ -224,7 +224,7 @@ def __init__(self, imgpath, super(NIH_Dataset, self).__init__() np.random.seed(seed) # Reset the seed so all runs are the same. - self.imgpath = imgpath + self.image_interface = create_interface(imgpath, self.path_length) self.csvpath = csvpath self.transform = transform self.data_aug = data_aug @@ -239,7 +239,6 @@ def __init__(self, imgpath, self.normalize = normalize # Load data - self.check_paths_exist() self.csv = pd.read_csv(self.csvpath, nrows=nrows) self.MAXVAL = 255 # Range [0 255] @@ -293,9 +292,7 @@ def __getitem__(self, idx): imgid = self.csv['Image Index'].iloc[idx] - img_path = os.path.join(self.imgpath, imgid) - #print(img_path) - img = imread(img_path) + img = self.image_interface.get_image(imgid) if self.normalize: img = normalize(img, self.MAXVAL) @@ -357,6 +354,7 @@ def get_mask_dict(self, image_name, this_size): return path_mask class RSNA_Pneumonia_Dataset(Dataset): + path_length = 1 """ RSNA Pneumonia Detection Challenge @@ -370,7 +368,7 @@ class RSNA_Pneumonia_Dataset(Dataset): """ def __init__(self, imgpath, - csvpath=os.path.join(thispath, "kaggle_stage_2_train_labels.csv.zip"), + csvpath=os.path.join(thispath, "kaggle_stage_2_train_labels.csv.zip"), dicomcsvpath=os.path.join(thispath, "kaggle_stage_2_train_images_dicom_headers.csv.gz"), views=["PA"], transform=None, @@ -434,6 +432,8 @@ def __init__(self, self.labels = np.asarray(self.labels).T self.labels = self.labels.astype(np.float32) + self.image_interface = create_interface(imgpath, 
self.path_length) + def __repr__(self): pprint.pprint(self.totals()) return self.__class__.__name__ + " num_samples={} views={}".format(len(self), self.views) @@ -447,13 +447,9 @@ def __getitem__(self, idx): sample["idx"] = idx sample["lab"] = self.labels[idx] - imgid = self.csv['patientId'].iloc[idx] - img_path = os.path.join(self.imgpath, imgid + self.extension) - #print(img_path) - if self.use_pydicom: - img=pydicom.filereader.dcmread(img_path).pixel_array - else: - img = imread(img_path) + imgid = self.csv['patientId'].iloc[idx] + self.extension + + img = self.image_interface.get_image(imgid) if self.normalize: img = normalize(img, self.MAXVAL) @@ -520,7 +516,7 @@ def get_mask_dict(self, image_name, this_size): return path_mask class NIH_Google_Dataset(Dataset): - + path_length = 1 """ Chest Radiograph Interpretation with Deep Learning Models: Assessment with Radiologist-adjudicated Reference Standards and Population-adjusted Evaluation @@ -585,6 +581,8 @@ def __init__(self, imgpath, # rename pathologies self.pathologies = np.char.replace(self.pathologies, "Airspace opacity", "Lung Opacity") + self.image_interface = create_interface(imgpath, self.path_length) + def __repr__(self): pprint.pprint(self.totals()) return self.__class__.__name__ + " num_samples={} views={}".format(len(self), self.views) @@ -594,9 +592,7 @@ def __len__(self): def __getitem__(self, idx): imgid = self.csv['Image Index'].iloc[idx] - img_path = os.path.join(self.imgpath, imgid) - #print(img_path) - img = imread(img_path) + img = self.image_interface.get(imgid) if self.normalize: img = normalize(img, self.MAXVAL) @@ -619,6 +615,7 @@ def __getitem__(self, idx): class PC_Dataset(Dataset): + path_length = 1 """ PadChest dataset Hospital San Juan de Alicante – University of Alicante @@ -665,13 +662,12 @@ def __init__(self, imgpath, mapping["Pleural_Thickening"] = ["pleural thickening"] mapping["Consolidation"] = ["air bronchogram"] - self.imgpath = imgpath + self.image_interface = 
create_interface(imgpath, self.path_length) self.transform = transform self.data_aug = data_aug self.flat_dir = flat_dir self.csvpath = csvpath - self.check_paths_exist() self.csv = pd.read_csv(self.csvpath, low_memory=False) self.MAXVAL = 65535 @@ -718,8 +714,7 @@ def __len__(self): def __getitem__(self, idx): imgid = self.csv['ImageID'].iloc[idx] - img_path = os.path.join(self.imgpath,imgid) - img = imread(img_path) + img = self.image_interface.get_image(imgid) img = normalize(img, self.MAXVAL) # Check that images are 2D arrays @@ -740,6 +735,7 @@ def __getitem__(self, idx): return {"img":img, "lab":self.labels[idx], "idx":idx} class CheX_Dataset(Dataset): + path_length = 4 """ CheXpert: A Large Chest Radiograph Dataset with Uncertainty Labels and Expert Comparison. Jeremy Irvin *, Pranav Rajpurkar *, Michael Ko, Yifan Yu, Silviana Ciurea-Ilcus, Chris Chute, Henrik Marklund, Behzad Haghgoo, Robyn Ball, Katie Shpanskaya, Jayne Seekins, David A. Mong, Safwan S. Halabi, Jesse K. Sandberg, Ricky Jones, David B. Larson, Curtis P. Langlotz, Bhavik N. Patel, Matthew P. Lungren, Andrew Y. 
Ng @@ -806,6 +802,8 @@ def __init__(self, imgpath, csvpath, views=["PA"], transform=None, data_aug=None self.pathologies = list(np.char.replace(self.pathologies, "Pleural Effusion", "Effusion")) + self.image_interface = create_interface(imgpath, self.path_length) + def __repr__(self): pprint.pprint(self.totals()) return self.__class__.__name__ + " num_samples={} views={}".format(len(self), self.views) @@ -817,8 +815,7 @@ def __getitem__(self, idx): imgid = self.csv['Path'].iloc[idx] imgid = imgid.replace("CheXpert-v1.0-small/","") - img_path = os.path.join(self.imgpath, imgid) - img = imread(img_path) + img = self.image_interface.get_image(imgid) img = normalize(img, self.MAXVAL) # Check that images are 2D arrays @@ -839,6 +836,7 @@ def __getitem__(self, idx): return {"img":img, "lab":self.labels[idx], "idx":idx} class MIMIC_Dataset(Dataset): + path_length = 4 """ Johnson AE, Pollard TJ, Berkowitz S, Greenbaum NR, Lungren MP, Deng CY, Mark RG, Horng S. MIMIC-CXR: A large publicly available database of labeled chest radiographs. arXiv preprint arXiv:1901.07042. 2019 Jan 21. @@ -849,8 +847,6 @@ class MIMIC_Dataset(Dataset): """ def __init__(self, imgpath, csvpath,metacsvpath, views=["PA"], transform=None, data_aug=None, flat_dir=True, seed=0, unique_patients=True): - - super(MIMIC_Dataset, self).__init__() np.random.seed(seed) # Reset the seed so all runs are the same. 
self.MAXVAL = 255 @@ -870,15 +866,16 @@ def __init__(self, imgpath, csvpath,metacsvpath, views=["PA"], transform=None, d self.pathologies = sorted(self.pathologies) - self.imgpath = imgpath + self.image_interface = create_interface(imgpath, self.path_length) self.transform = transform self.data_aug = data_aug self.csvpath = csvpath self.csv = pd.read_csv(self.csvpath) self.metacsvpath = metacsvpath self.metacsv = pd.read_csv(self.metacsvpath) - + self.csv = self.csv.set_index(['subject_id', 'study_id']) + self.metacsv = self.metacsv.set_index(['subject_id', 'study_id']) self.csv = self.csv.join(self.metacsv).reset_index() @@ -911,6 +908,8 @@ def __init__(self, imgpath, csvpath,metacsvpath, views=["PA"], transform=None, d # rename pathologies self.pathologies = np.char.replace(self.pathologies, "Pleural Effusion", "Effusion") + + super(MIMIC_Dataset, self).__init__() def __repr__(self): @@ -921,14 +920,13 @@ def __len__(self): return len(self.labels) def __getitem__(self, idx): - subjectid = str(self.csv.iloc[idx]["subject_id"]) studyid = str(self.csv.iloc[idx]["study_id"]) dicom_id = str(self.csv.iloc[idx]["dicom_id"]) - - img_path = os.path.join(self.imgpath, "p" + subjectid[:2], "p" + subjectid, "s" + studyid, dicom_id + ".jpg") - img = imread(img_path) - img = normalize(img, self.MAXVAL) + img_fname = os.path.join("p" + subjectid[:2], "p" + subjectid, "s" + studyid, dicom_id + ".jpg") + + img = self.image_interface.get_image(img_fname) + img = normalize(img, self.MAXVAL) # Check that images are 2D arrays if len(img.shape) > 2: @@ -948,6 +946,7 @@ def __getitem__(self, idx): return {"img":img, "lab":self.labels[idx], "idx":idx} class Openi_Dataset(Dataset): + path_length = 1 """ OpenI @@ -1053,6 +1052,8 @@ def __init__(self, imgpath, self.pathologies = np.char.replace(self.pathologies, "Opacity", "Lung Opacity") self.pathologies = np.char.replace(self.pathologies, "Lesion", "Lung Lesion") + self.image_interface = create_interface(imgpath, self.path_length) + def 
__repr__(self): pprint.pprint(self.totals()) return self.__class__.__name__ + " num_samples={}".format(len(self)) @@ -1061,10 +1062,8 @@ def __len__(self): return len(self.labels) def __getitem__(self, idx): - imageid = self.csv.iloc[idx].imageid - img_path = os.path.join(self.imgpath,imageid + ".png") - #print(img_path) - img = imread(img_path) + imageid = self.csv.iloc[idx].imageid + ".png" + img = self.image_interface.get_image(imageid) img = normalize(img, self.MAXVAL) # Check that images are 2D arrays @@ -1085,6 +1084,7 @@ def __getitem__(self, idx): return {"img":img, "lab":self.labels[idx], "idx":idx} class COVID19_Dataset(Dataset): + path_length = 1 """ COVID-19 image data collection @@ -1144,6 +1144,8 @@ def __init__(self, self.labels = np.asarray(self.labels).T self.labels = self.labels.astype(np.float32) + self.image_interface = create_interface(imgpath, self.path_length) + def __repr__(self): pprint.pprint(self.totals()) return self.__class__.__name__ + " num_samples={} views={}".format(len(self), self.views) @@ -1153,9 +1155,7 @@ def __len__(self): def __getitem__(self, idx): imgid = self.csv['filename'].iloc[idx] - img_path = os.path.join(self.imgpath, imgid) - #print(img_path) - img = imread(img_path) + img = self.image_interface.get_image(imgid) img = normalize(img, self.MAXVAL) # Check that images are 2D arrays @@ -1176,6 +1176,7 @@ def __getitem__(self, idx): return {"img":img, "lab":self.labels[idx], "idx":idx} class NLMTB_Dataset(Dataset): + path_length = 2 """ National Library of Medicine Tuberculosis Datasets https://lhncbc.nlm.nih.gov/publication/pub9931 @@ -1213,7 +1214,10 @@ def __init__(self, file_list = [] source_list = [] - for fname in sorted(os.listdir(os.path.join(self.imgpath, "CXR_png"))): + + self.image_interface = create_interface(imgpath, self.path_length) + + for fname in sorted(self.image_interface.filename_mapping): if fname.endswith(".png"): file_list.append(fname) @@ -1221,7 +1225,7 @@ def __init__(self, #Label is the last 
digit on the simage filename self.csv["label"] = self.csv["fname"].apply(lambda x: int(x.split(".")[-2][-1])) - + self.labels = self.csv["label"].values.reshape(-1,1) self.pathologies = ["Tuberculosis"] self.views = views @@ -1238,9 +1242,8 @@ def __len__(self): def __getitem__(self, idx): item = self.csv.iloc[idx] - img_path = os.path.join(self.imgpath, "CXR_png", item["fname"]) - #print(img_path) - img = imread(img_path) + imgid = item["fname"] #os.path.join("CXR_png", item["fname"]) + img = self.image_interface.get_image(imgid) img = normalize(img, self.MAXVAL) # Check that images are 2D arrays diff --git a/torchxrayvision/storage_interface.py b/torchxrayvision/storage_interface.py new file mode 100644 index 0000000..be9e36f --- /dev/null +++ b/torchxrayvision/storage_interface.py @@ -0,0 +1,330 @@ +from PIL import Image +import os +from hashlib import blake2b +import pickle +import zipfile +import tarfile +import multiprocessing +import pydicom +from pathlib import Path +import numpy as np +from io import BytesIO + +Image.init() #loads image extensions + +""" +You can read agnostically from folders, zipfiles, and tarfiles with this submodule. +You can retrieve each file using just the last n elements of its path (you pick n). + +You can create an interface using create_interface(imgpath, n). The path to the +archive or folder is imgpath, and the path length you will use for retrieving files +is n. + +Then, you can retrieve each file with the .get_image() method of the returned object. + +Example: + +interface = create_interface("/path/to/images.tar", n = 3) +interface.get_image("element1/element2/element3.jpg") #note n = 3 + +The "interface" object will belong to one of four classes: + TarInterface - for tarfiles + ZipInterface - for zipfiles + FolderInterface - for folders containing images + ArchiveFolder - for folders containing multiple tarfiles/zipfiles. 
# Terminates the module docstring that is opened just before this block: """


def last_n_in_filepath(filepath, n):
    """
    Return the last n pieces of a path, joined with os.path.join.

    For example:
        last_n_in_filepath("a/b/c", 2) -> "b/c"

    Args:
        filepath: The path to shorten (a string or any os.PathLike).
        n: How many trailing path components to keep.

    Returns:
        The shortened path as a string, or "" when n < 1.
    """
    if n < 1:
        return ""
    remaining, tail = os.path.split(filepath)
    for _ in range(n - 1):
        remaining, parent = os.path.split(remaining)
        tail = os.path.join(parent, tail)
    return tail


def get_filename_mapping_path(imgpath, path_length):
    """
    Return the path of the cached index file for (imgpath, path_length).

    The cache filename is a blake2b hash of the absolute image path, its
    last-modification time and the path length, so modifying the archive
    or changing path_length selects a different cache file.

    Raises:
        OSError: If imgpath does not exist (from os.path.getmtime).
    """
    imgpath = os.path.abspath(str(imgpath))  # str(): Path objects are accepted too
    # NOTE(review): for a folder, the mtime presumably does not change when
    # files inside nested subfolders change, so a cached index could become
    # stale in that case -- confirm.
    key = (imgpath, os.path.getmtime(imgpath), path_length)

    # Construct the filename from a hash of imgpath, timestamp, and length.
    cache_filename = blake2b(pickle.dumps(key)).hexdigest() + ".pkl"

    cache_folder = os.path.expanduser(os.path.join(
        "~", ".torchxrayvision", "filename-mapping-cache"
    ))
    return os.path.join(cache_folder, cache_filename)


def load_filename_mapping(imgpath, path_length):
    """
    Return the cached filename mapping if one exists, otherwise None.

    None is also returned when the cache file exists but cannot be read or
    unpickled (for example after an interrupted write); the callers in this
    module then rebuild the index instead of failing.
    """
    filename_mapping_path = get_filename_mapping_path(imgpath, path_length)

    if not os.path.exists(filename_mapping_path):
        return None

    print("Loading indexed file paths from cache")
    try:
        # NOTE: pickle is only used on the cache file this module writes
        # itself under ~/.torchxrayvision; never point it at untrusted data.
        with open(filename_mapping_path, "rb") as handle:
            return pickle.load(handle)
    except (OSError, EOFError, pickle.UnpicklingError):
        return None


def save_filename_mapping(imgpath, path_length, filename_mapping):
    """
    Pickle filename_mapping into the cache file for (imgpath, path_length).

    Saving is best-effort: a failure to create the cache folder or to write
    the file is reported through the return value rather than raised.

    Returns:
        True if the mapping was written, False otherwise.
    """
    filename_mapping_path = get_filename_mapping_path(imgpath, path_length)

    try:
        os.makedirs(os.path.dirname(filename_mapping_path), exist_ok=True)
        with open(filename_mapping_path, "wb") as handle:
            pickle.dump(filename_mapping, handle)
    except (OSError, pickle.PicklingError):
        return False
    return True


def convert_to_image(filename, bytes):
    """
    Convert the raw bytes of an image file to a numpy array.

    If the filename ends with .dcm the bytes are read with pydicom and the
    pixel array is returned; otherwise they are opened with PIL.
    """
    # The parameter name shadows the builtin but is kept for keyword callers.
    if str(filename).endswith(".dcm"):
        return pydicom.filereader.dcmread(BytesIO(bytes), force=True).pixel_array
    return np.array(Image.open(BytesIO(bytes)))


class StorageInterface(object):
    "Common base class of all storage interfaces in this module."
    pass


class TarInterface(StorageInterface):
    "This class supports extracting files from a tar archive based on a partial path."

    @classmethod
    def matches(cls, filename):
        "Return whether the given path is a tar archive."
        return not os.path.isdir(filename) and tarfile.is_tarfile(filename)

    def __init__(self, imgpath, path_length):
        "Store the archive path, and the length of the partial paths within the archive"
        self.path_length = path_length
        self.imgpath = imgpath

        # Load the filename mapping from the cache; index() also hands back
        # an open archive, which is reused instead of opening it twice.
        compressed = None
        self.filename_mapping = load_filename_mapping(imgpath, path_length)
        if self.filename_mapping is None:
            compressed, self.filename_mapping = self.index(imgpath)
            save_filename_mapping(imgpath, path_length, self.filename_mapping)
        if compressed is None:
            compressed = tarfile.open(imgpath)
        # Open handles, keyed by the name of the process that opened them.
        self.all_compressed = {multiprocessing.current_process().name: compressed}

    def get_image(self, imgid):
        "Return the image object for the partial path provided."
        archive_path = self.filename_mapping[imgid]
        process_name = multiprocessing.current_process().name
        if process_name not in self.all_compressed:
            # Reset the table of open files if it has grown too large,
            # closing the handles explicitly rather than just dropping them.
            if len(self.all_compressed) > 64:
                self.close()
                self.all_compressed = {}
            self.all_compressed[process_name] = tarfile.open(self.imgpath)
        data = self.all_compressed[process_name].extractfile(archive_path).read()
        return convert_to_image(archive_path, data)

    def index(self, imgpath):
        """
        Create a dictionary mapping partial path -> path within archive.

        Returns:
            (open TarFile, mapping). Directory members are not indexed.
        """
        print("Indexing file paths (one-time). The next load will be faster")
        compressed = tarfile.open(imgpath)
        filename_mapping = {}
        for tar_info in compressed.getmembers():
            # TarInfo.type is a bytes constant, so it can never equal the
            # string "DIRTYPE"; isdir() is the reliable directory test.
            if not tar_info.isdir():
                tar_path = tar_info.name
                imgid = last_n_in_filepath(tar_path, self.path_length)
                filename_mapping[imgid] = tar_path
        return compressed, filename_mapping

    def close(self):
        "Close all open tarfiles."
        for compressed in self.all_compressed.values():
            compressed.close()


class ZipInterface(StorageInterface):
    "This class supports extracting files from a zip archive based on a partial path."

    @classmethod
    def matches(cls, filename):
        "Return whether the given path is a zip archive."
        return not os.path.isdir(filename) and zipfile.is_zipfile(filename)

    def __init__(self, imgpath, path_length):
        "Store the archive path, and the length of the partial paths within the archive"
        self.path_length = path_length
        self.imgpath = imgpath

        # Load the filename mapping from the cache; index() also hands back
        # an open archive, which is reused instead of opening it twice.
        compressed = None
        self.filename_mapping = load_filename_mapping(imgpath, path_length)
        if self.filename_mapping is None:
            compressed, self.filename_mapping = self.index(imgpath)
            save_filename_mapping(imgpath, path_length, self.filename_mapping)
        if compressed is None:
            compressed = zipfile.ZipFile(imgpath)
        # Open handles, keyed by the name of the process that opened them.
        self.all_compressed = {multiprocessing.current_process().name: compressed}

    def get_image(self, imgid):
        "Return the image object for the partial path provided."
        archive_path = self.filename_mapping[imgid]
        process_name = multiprocessing.current_process().name
        if process_name not in self.all_compressed:
            # Reset the table of open files if it has grown too large,
            # closing the handles explicitly rather than just dropping them.
            if len(self.all_compressed) > 64:
                self.close()
                self.all_compressed = {}
            self.all_compressed[process_name] = zipfile.ZipFile(self.imgpath)
        data = self.all_compressed[process_name].open(archive_path).read()
        return convert_to_image(archive_path, data)

    def index(self, imgpath):
        """
        Create a dictionary mapping partial path -> path within archive.

        Returns:
            (open ZipFile, mapping). Directory entries are not indexed.
        """
        print("Indexing file paths (one-time). The next load will be faster")
        compressed = zipfile.ZipFile(imgpath)
        filename_mapping = {}
        for zip_info in compressed.infolist():
            if not zip_info.is_dir():
                zip_path = zip_info.filename
                imgid = last_n_in_filepath(zip_path, self.path_length)
                filename_mapping[imgid] = zip_path
        return compressed, filename_mapping

    def close(self):
        "Close all open zipfiles."
        for compressed in self.all_compressed.values():
            compressed.close()


class FolderInterface(StorageInterface):
    "This class supports drawing files from a folder based on a partial path."

    @classmethod
    def matches(cls, filename):
        "Return whether the given path is a folder."
        return os.path.isdir(filename)

    def __init__(self, imgpath, path_length):
        "Store the length of the partial paths, and load or build the index of the folder"
        self.path_length = path_length

        self.filename_mapping = load_filename_mapping(imgpath, path_length)
        # If the filename mapping could not be loaded, create it and save it
        if self.filename_mapping is None:
            _, self.filename_mapping = self.index(imgpath)
            save_filename_mapping(imgpath, path_length, self.filename_mapping)

    def get_image(self, imgid):
        "Return the image object for the partial path provided."
        file_path = self.filename_mapping[imgid]
        with open(file_path, "rb") as handle:
            return convert_to_image(file_path, handle.read())

    def index(self, imgpath):
        """
        Create a dictionary mapping partial path -> path of the file on disk.

        Returns:
            (imgpath, mapping); the first element mirrors the shape of the
            archive interfaces' index() return value.
        """
        print("Indexing file paths (one-time). The next load will be faster")
        filename_mapping = {}
        for path in Path(imgpath).rglob("*"):
            if not os.path.isdir(path):
                imgid = last_n_in_filepath(path, self.path_length)
                filename_mapping[imgid] = path
        return imgpath, filename_mapping

    def close(self):
        "Nothing to close: files are opened and closed on every get_image call."
        pass


def is_image(filename):
    "Return whether the given filename has an image extension known to PIL."
    _, extension = os.path.splitext(filename)
    # PIL registers its extensions in lower case, so compare case-insensitively.
    return extension.lower() in Image.EXTENSION


# Interfaces that read from a single archive file.
archive_interfaces = [ZipInterface, TarInterface]


def is_archive(filename):
    "Return whether the given filename is a tarfile or zipfile."
    return any(interface.matches(filename) for interface in archive_interfaces)


class ArchiveFolder(StorageInterface):
    "This class supports extracting files from multiple tar or zip archives under the same root directory."

    @classmethod
    def matches(cls, filename):
        """
        Return whether the path is a folder of archives.

        The folder is walked recursively and the first image or archive
        encountered decides: an image means "not an archive folder", an
        archive means "archive folder". With neither, the answer is False.
        """
        for item in Path(filename).rglob("*"):
            if is_image(item):
                return False
            if is_archive(item):
                return True
        return False

    def __init__(self, imgpath, path_length):
        "Store the length of the partial paths, open the sub-archives and load or build the index"
        self.path_length = path_length
        self.archives = None
        self.filename_mapping = load_filename_mapping(imgpath, path_length)
        # If the filename mapping could not be loaded, create it and save it
        if self.filename_mapping is None:
            self.archives, self.filename_mapping = self.index(imgpath)
            save_filename_mapping(imgpath, path_length, self.filename_mapping)
        # If the sub-archives have still not been opened, open them.
        if self.archives is None:
            self.archives = self.get_archive(imgpath)

    def get_image(self, imgid):
        "Return the image object for the partial path provided."
        path_to_archive = self.filename_mapping[imgid]
        return self.archives[path_to_archive].get_image(imgid)

    def index(self, filename):
        """
        Create a dictionary mapping imgid -> containing sub-archive.
        The archives are identified by their filenames. The sub-archive
        will then be queried itself.

        This is different from the index method of ZipInterface and
        TarInterface, where the dictionary values are the actual file
        paths.

        Returns:
            (dict of sub-archive path -> interface, mapping).
        """
        archives = self.get_archive(filename)
        filename_mapping = {}
        for path_to_archive, archive in archives.items():
            for path_in_csv in archive.filename_mapping:
                filename_mapping[path_in_csv] = path_to_archive
        return archives, filename_mapping

    def get_archive(self, filename):
        "Return a dictionary mapping the path of every archive under filename to its interface."
        archives = {}
        for path_to_archive in Path(filename).rglob("*"):
            if is_archive(path_to_archive):
                archives[path_to_archive] = create_interface(path_to_archive, self.path_length)
        return archives

    def close(self):
        "Recursively close all open archives."
        for archive in self.archives.values():
            archive.close()


# Order matters: create_interface picks the first interface whose matches()
# accepts the path, and FolderInterface.matches accepts every folder, so
# ArchiveFolder has to be tried before it.
interfaces = [ArchiveFolder, FolderInterface, TarInterface, ZipInterface]


def create_interface(filename, path_length, interfaces=interfaces):
    "Choose the right interface type for the given path, and return an initialized interface."
    for interface in interfaces:
        if interface.matches(filename):
            return interface(filename, path_length)