diff --git a/idt/bing.py b/idt/bing.py index f2e780c..53be9f7 100644 --- a/idt/bing.py +++ b/idt/bing.py @@ -1,12 +1,13 @@ import os -import json import requests import re -from idt.utils.download_images import download +from idt.utils.download_thread_images import downloadThread from idt.utils.remove_corrupt import erase_duplicates from rich.progress import Progress +from concurrent.futures import ThreadPoolExecutor +import concurrent.futures __name__ = "bing" @@ -34,6 +35,7 @@ def search(self): data = data[1:] page_counter = 0 + future_list = [] with Progress() as progress: task1 = progress.add_task(f"Downloading [blue]{self.data}[/blue] class...",total=self.n_images) while self.downloaded_images < self.n_images: @@ -52,15 +54,15 @@ def search(self): if not os.path.exists(target_folder): os.mkdir(target_folder) - for link in results: - try: - if self.downloaded_images < self.n_images: - download(link,self.size,self.root_folder,self.folder, self.resize_method) - self.downloaded_images += 1 - progress.update(task1, advance=1) - else: - break; - except: - continue + with ThreadPoolExecutor(max_workers=5) as executor: + future_list += [executor.submit(downloadThread, link, self) for link in results] + for future in concurrent.futures.as_completed(future_list): + try: + future.result() + except Exception as exc: + pass + else: + progress.update(task1, advance=1) + self.downloaded_images -= erase_duplicates(target_folder) print('Done') diff --git a/idt/bing_api.py b/idt/bing_api.py index 3fd7520..c4ed63e 100644 --- a/idt/bing_api.py +++ b/idt/bing_api.py @@ -64,5 +64,5 @@ def search(self): break; except: continue - self.downloaded_images -= erase_duplicates(target_folder) + self.downloaded_images -= erase_duplicates(target_folder) generate_class_info(self.dataset_info,self.root_folder, self.folder) diff --git a/idt/duckgo.py b/idt/duckgo.py index b6e77dd..8f6cb24 100644 --- a/idt/duckgo.py +++ b/idt/duckgo.py @@ -2,15 +2,15 @@ import re; import json; import time; 
-import logging; import os; from rich.progress import Progress +from concurrent.futures import ThreadPoolExecutor +import concurrent.futures -from idt.utils.download_images import download +from idt.utils.download_thread_images import downloadThread from idt.utils.remove_corrupt import erase_duplicates __name__ = "duckgo" - class DuckGoSearchEngine: def __init__(self, data, n_images, folder, resize_method, root_folder, size): self.data = data @@ -21,7 +21,7 @@ def __init__(self, data, n_images, folder, resize_method, root_folder, size): self.size = size self.downloaded_images = 0 self.search() - + def search(self): URL = 'https://duckduckgo.com/' PARAMS = {'q': self.data} @@ -38,7 +38,6 @@ def search(self): res = requests.post(URL, data=PARAMS, timeout=3.000) search_object = re.search(r'vqd=([\d-]+)\&', res.text, re.M|re.I) - #print(search_object) if not search_object: return -1; @@ -53,15 +52,15 @@ def search(self): ('v7exp', 'a')) request_url = URL + "i.js"; + future_list = [] with Progress() as progress: - task1 = progress.add_task("[blue]Downloading {x} class...".format(x=self.data), total=self.n_images) while self.downloaded_images < self.n_images: while True: try: - res = requests.get(request_url, headers=HEADERS, params=PARAMS, timeout=3.000); + res = requests.get(request_url, headers=HEADERS, params=PARAMS, timeout=3.000, stream=True); data = json.loads(res.text); - break; + break except ValueError as e: time.sleep(5); continue; @@ -77,16 +76,18 @@ def search(self): if len(data["results"]) > self.n_images - self.downloaded_images: data["results"] = data["results"][:self.n_images - self.downloaded_images] - for results in data["results"]: - try: - download(results["image"], self.size, self.root_folder, self.folder, self.resize_method) - self.downloaded_images+= 1 - progress.update(task1, advance=1) - except Exception as e: - continue - + with ThreadPoolExecutor(max_workers=5) as executor: + future_list += [executor.submit(downloadThread, results["image"], 
import threading

# Guards the check-and-update of the shared ``downloaded_images`` counter.
# ``downloadThread`` is submitted to a ThreadPoolExecutor by the search
# engines, so several calls may touch the same engine object concurrently.
_DOWNLOAD_COUNTER_LOCK = threading.Lock()


def downloadThread(link, self):
    """Download one image for a search engine, honouring its image quota.

    Intended to be run as a thread-pool task. A slot is reserved in
    ``self.downloaded_images`` before the download starts so that concurrent
    tasks can never exceed ``self.n_images``; the slot is handed back if the
    download fails.

    Args:
        link: Location of the image, passed through to ``download``.
        self: The search-engine object. Must expose ``downloaded_images``,
            ``n_images``, ``size``, ``root_folder``, ``folder`` and
            ``resize_method``.

    Raises:
        RuntimeError: With message ``"Exceed"`` when the quota is already
            reached; nothing is downloaded in that case.
        Exception: Whatever ``download`` raises is propagated, so that the
            caller's ``future.result()`` can tell successes from failures
            (and only advance its progress bar on success).
    """
    # Check and reserve atomically: a strict ``<`` comparison (expressed here
    # as the inverse guard) keeps the total at exactly ``n_images``.
    with _DOWNLOAD_COUNTER_LOCK:
        if self.downloaded_images >= self.n_images:
            raise RuntimeError("Exceed")
        self.downloaded_images += 1

    # The slow network/disk work happens outside the lock so downloads still
    # run in parallel.
    try:
        download(link, self.size, self.root_folder, self.folder, self.resize_method)
    except Exception:
        # Release the reserved slot; this image did not make it to disk.
        with _DOWNLOAD_COUNTER_LOCK:
            self.downloaded_images -= 1
        raise