From b40c10607597bc01969717e9f2ffe82e2be299a2 Mon Sep 17 00:00:00 2001
From: rusleak
Date: Tue, 13 May 2025 14:18:44 +0200
Subject: [PATCH 1/2] Add timeouts, exception handling, unique filenames, and
 file type checks to improve safety and efficiency.

- Timeout for HTTP requests and image downloads to prevent hanging.
- Handle request and image download errors gracefully.
- Ensure only `.jpg` and `.png` files are downloaded.
- Prevent file overwriting by adding UUIDs to image filenames.
---
 app.py | 78 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 45 insertions(+), 33 deletions(-)

diff --git a/app.py b/app.py
index 68c1d0e..b19f27c 100644
--- a/app.py
+++ b/app.py
@@ -3,81 +3,93 @@
 import requests
 from PIL import Image
 from bs4 import BeautifulSoup
+import uuid  # For unique filenames
+
 
 class ImageSpider:
     def __init__(self):
         self.home = os.getcwd()
-    
+
     def grab_all_image_links(self, URL):
         try:
             valid_links = []
             url_protocol = URL.split('/')[0]
-            url_html = requests.get(URL).text
-            Image_urls = re.findall(r'((http\:|https\:)?\/\/[^"\' ]*?\.(png|jpg))', url_html, flags=re.IGNORECASE | re.MULTILINE | re.UNICODE)
+            url_html = requests.get(URL, timeout=10).text  # #1 Timeout for the request for safety
+            Image_urls = re.findall(r'((http\:|https\:)?\/\/[^"\' ]*?\.(png|jpg))', url_html,
+                                    flags=re.IGNORECASE | re.MULTILINE | re.UNICODE)
             for image in Image_urls:
                 image_url = image[0]
                 if not image_url.startswith(url_protocol):
-                    image_url = url_protocol+image_url
-                    valid_links.append(image_url)
-                else:
-                    valid_links.append(image_url)
+                    image_url = url_protocol + image_url
+                valid_links.append(image_url)
             print('Done')
-        except Exception as graberror:
-            print('Grab occured while getting links')
+        except requests.RequestException as graberror:  # #2 Error handler for request issues
+            print('Grab error while getting links')
             print(graberror)
-            return [] 
+            return []
         return valid_links

     @staticmethod
     def extract_image_name(url):
+        # #3 Generating unique filenames
         image_name = str(url).split('/')[-1]
-        return image_name
+        unique_name = f"{uuid.uuid4()}_{image_name}"  # Adding a unique identifier to prevent overwriting
+        return unique_name

     @staticmethod
     def extract_site_name(url):
         sitename = str(url).split('/')[2]
         return sitename
-    
-    def saving_images(self,url):
+
+    def saving_images(self, url):
         Image_links = self.grab_all_image_links(url)
         for link in Image_links:
-            raw_image = requests.get(link, stream=True).raw
-            img = Image.open(raw_image)
-            image_name = self.extract_image_name(link)
-            img.save(image_name)
-
+            if not link.endswith(('.jpg', '.png')):  # #4 Check file extension before downloading
+                continue
+            try:
+                raw_image = requests.get(link, stream=True, timeout=10).raw  # #5 Timeout for downloading images
+                img = Image.open(raw_image)
+                image_name = self.extract_image_name(link)
+                img.save(image_name)
+            except Exception as e:  # #6 Error handler for image download issues
+                print(f"Failed to download {link}: {e}")
+
     def grab_all_links(self, url):
         links = [url]
-        link_html = requests.get(url).text
-        all_links = BeautifulSoup(link_html, 'html.parser').findAll('a')
-        for link in all_links:
-            href = link.get('href')
-            if href:
-                if href.startswith('http') or href.startswith('https'):
-                    links.append(href)
+        try:
+            link_html = requests.get(url, timeout=10).text  # #7 Timeout for fetching all links
+            all_links = BeautifulSoup(link_html, 'html.parser').findAll('a')
+            for link in all_links:
+                href = link.get('href')
+                if href:
+                    if href.startswith('http') or href.startswith('https'):
+                        links.append(href)
+        except requests.RequestException as e:  # #8 Error handler for all link requests
+            print(f"Failed to grab links from {url}: {e}")
         return links

     def download_images(self):
         url = input('Enter URL with images : ')
         try:
             sitename = self.extract_site_name(url)
-            print('Extracting from {} ...'.format(sitename))
-            os.mkdir(sitename);os.chdir(sitename)
-            print('\nShould we can scan entire site or just home page ?')
-            option = int(input('1. Entire site\n2.Just this page\nOption : '))
+            print(f'Extracting from {sitename} ...')
+            os.mkdir(sitename)
+            os.chdir(sitename)
+            print('\nShould we scan the entire site or just the home page?')
+            option = int(input('1. Entire site\n2. Just this page\nOption : '))
             if option == 1:
                 all_avaialble_links = set(self.grab_all_links(url))
             else:
                 all_avaialble_links = [url]
             for link in all_avaialble_links:
-                try: 
+                try:
                     print(link)
                     self.saving_images(link)
-                except:
-                    continue
+                except Exception as e:  # #9 Error handler for saving images
+                    print(f"Error while saving images from {link}: {e}")

         except Exception as Error:
-            print('Error occured while grabing site links')
+            print('Error occurred while grabbing site links')
             print(Error)

         finally:
             print('Scraping finished')
             os.chdir(self.home)
 
 
 spider = ImageSpider()
 spider.download_images()
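To see the effect of the filename change from PATCH 1/2 in isolation, here is a minimal standalone sketch of the same renaming logic (a re-implementation for illustration, not an import of app.py; the example URL is made up):

    import uuid

    def extract_image_name(url):
        # Same logic the patch adds to ImageSpider.extract_image_name:
        # keep the URL's basename, but prefix it with a random UUID.
        image_name = str(url).split('/')[-1]
        return f"{uuid.uuid4()}_{image_name}"

    # Two downloads of the same URL now get distinct filenames,
    # so the second no longer overwrites the first:
    print(extract_image_name('https://example.com/pics/photo.jpg'))
    print(extract_image_name('https://example.com/pics/photo.jpg'))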
From af76e29fc6989e103e4d670df3704ed480bb53d4 Mon Sep 17 00:00:00 2001
From: rusleak
Date: Sun, 1 Feb 2026 15:46:20 +0100
Subject: [PATCH 2/2] Fuzzy controller implemented

---
 app.py | 153 ++++++++++++++++++++------------------------------------
 1 file changed, 54 insertions(+), 99 deletions(-)

diff --git a/app.py b/app.py
index b19f27c..28ed08d 100644
--- a/app.py
+++ b/app.py
@@ -1,101 +1,56 @@
-import os
-import re
-import requests
-from PIL import Image
-from bs4 import BeautifulSoup
-import uuid  # For unique filenames
+import numpy as np
+import skfuzzy as fuzz
+from skfuzzy import control as ctrl
+
+# --- Declaration of the variable ranges ---
+weight = ctrl.Antecedent(np.arange(0, 1001, 1), 'weight')
+dryer = ctrl.Antecedent(np.arange(0, 101, 1), 'dryer')
+time = ctrl.Consequent(np.arange(0, 241, 1), 'time')
+
+# --- Membership functions ---
+weight['low'] = fuzz.trimf(weight.universe, [0, 0, 200])
+weight['medium'] = fuzz.trimf(weight.universe, [150, 325, 500])
+weight['high'] = fuzz.trimf(weight.universe, [400, 700, 1000])
+
+dryer['low'] = fuzz.trimf(dryer.universe, [0, 0, 40])
+dryer['medium'] = fuzz.trimf(dryer.universe, [30, 50, 70])
+dryer['high'] = fuzz.trimf(dryer.universe, [60, 100, 100])
+
+time['low'] = fuzz.trimf(time.universe, [0, 0, 60])
+time['medium'] = fuzz.trimf(time.universe, [50, 85, 120])
+time['high'] = fuzz.trimf(time.universe, [100, 170, 240])
+
+# Visualisation of the membership functions
+weight.view()
+dryer.view()
+time.view()
+
+
+# --- Rules ---
+rule1 = ctrl.Rule(weight['low'] & dryer['high'], time['low'])
+rule2 = ctrl.Rule(weight['low'] & dryer['medium'], time['medium'])
+rule3 = ctrl.Rule(weight['low'] & dryer['low'], time['medium'])
+rule4 = ctrl.Rule(weight['medium'] & dryer['high'], time['medium'])
+rule5 = ctrl.Rule(weight['medium'] & dryer['medium'], time['medium'])
+rule6 = ctrl.Rule(weight['medium'] & dryer['low'], time['high'])
+rule7 = ctrl.Rule(weight['high'] & dryer['high'], time['medium'])
+rule8 = ctrl.Rule(weight['high'] & dryer['medium'], time['high'])
+rule9 = ctrl.Rule(weight['high'] & dryer['low'], time['high'])
+
+# --- Creating the controller ---
+drying_ctrl = ctrl.ControlSystem([rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8, rule9])
+drying = ctrl.ControlSystemSimulation(drying_ctrl)
+
+# --- Example 1 ---
+drying.input['weight'] = 150
+drying.input['dryer'] = 80
+drying.compute()
+print("Example 1: drying time =", drying.output['time'], "min")  # expecting a short time
+
+# --- Example 2 ---
+drying.input['weight'] = 600
+drying.input['dryer'] = 30
+drying.compute()
+print("Example 2: drying time =", drying.output['time'], "min")  # expecting a long time
 
-class ImageSpider:
-    def __init__(self):
-        self.home = os.getcwd()
-
-    def grab_all_image_links(self, URL):
-        try:
-            valid_links = []
-            url_protocol = URL.split('/')[0]
-            url_html = requests.get(URL, timeout=10).text  # #1 Timeout for the request for safety
-            Image_urls = re.findall(r'((http\:|https\:)?\/\/[^"\' ]*?\.(png|jpg))', url_html,
-                                    flags=re.IGNORECASE | re.MULTILINE | re.UNICODE)
-            for image in Image_urls:
-                image_url = image[0]
-                if not image_url.startswith(url_protocol):
-                    image_url = url_protocol + image_url
-                valid_links.append(image_url)
-            print('Done')
-        except requests.RequestException as graberror:  # #2 Error handler for request issues
-            print('Grab error while getting links')
-            print(graberror)
-            return []
-        return valid_links
-
-    @staticmethod
-    def extract_image_name(url):
-        # #3 Generating unique filenames
-        image_name = str(url).split('/')[-1]
-        unique_name = f"{uuid.uuid4()}_{image_name}"  # Adding a unique identifier to prevent overwriting
-        return unique_name
-
-    @staticmethod
-    def extract_site_name(url):
-        sitename = str(url).split('/')[2]
-        return sitename
-
-    def saving_images(self, url):
-        Image_links = self.grab_all_image_links(url)
-        for link in Image_links:
-            if not link.endswith(('.jpg', '.png')):  # #4 Check file extension before downloading
-                continue
-            try:
-                raw_image = requests.get(link, stream=True, timeout=10).raw  # #5 Timeout for downloading images
-                img = Image.open(raw_image)
-                image_name = self.extract_image_name(link)
-                img.save(image_name)
-            except Exception as e:  # #6 Error handler for image download issues
-                print(f"Failed to download {link}: {e}")
-
-    def grab_all_links(self, url):
-        links = [url]
-        try:
-            link_html = requests.get(url, timeout=10).text  # #7 Timeout for fetching all links
-            all_links = BeautifulSoup(link_html, 'html.parser').findAll('a')
-            for link in all_links:
-                href = link.get('href')
-                if href:
-                    if href.startswith('http') or href.startswith('https'):
-                        links.append(href)
-        except requests.RequestException as e:  # #8 Error handler for all link requests
-            print(f"Failed to grab links from {url}: {e}")
-        return links
-
-    def download_images(self):
-        url = input('Enter URL with images : ')
-        try:
-            sitename = self.extract_site_name(url)
-            print(f'Extracting from {sitename} ...')
-            os.mkdir(sitename)
-            os.chdir(sitename)
-            print('\nShould we scan the entire site or just the home page?')
-            option = int(input('1. Entire site\n2. Just this page\nOption : '))
-            if option == 1:
-                all_avaialble_links = set(self.grab_all_links(url))
-            else:
-                all_avaialble_links = [url]
-            for link in all_avaialble_links:
-                try:
-                    print(link)
-                    self.saving_images(link)
-                except Exception as e:  # #9 Error handler for saving images
-                    print(f"Error while saving images from {link}: {e}")
-
-        except Exception as Error:
-            print('Error occurred while grabbing site links')
-            print(Error)
-
-        finally:
-            print('Scraping finished')
-            os.chdir(self.home)
-
-
-spider = ImageSpider()
-spider.download_images()
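To exercise the controller from PATCH 2/2 repeatedly without the blocking view() windows, a small helper along these lines could be appended to the new app.py; this is a minimal sketch, assuming drying_ctrl is built exactly as in the patch, and the function name drying_time and the spot-check values are illustrative:

    from skfuzzy import control as ctrl

    def drying_time(weight_g, dryness_pct):
        # Fresh simulation per call so stale inputs never leak between runs.
        sim = ctrl.ControlSystemSimulation(drying_ctrl)
        sim.input['weight'] = weight_g     # grams, within the 0-1000 universe
        sim.input['dryer'] = dryness_pct   # dryness level, within the 0-100 universe
        sim.compute()                      # Mamdani inference over rule1..rule9
        return float(sim.output['time'])   # defuzzified minutes

    # Spot checks mirroring the patch's two examples:
    print(drying_time(150, 80))  # light, fairly dry load -> short time expected
    print(drying_time(600, 30))  # heavy, wet load -> long time expected

Constructing a new ControlSystemSimulation inside the helper trades a little setup cost for the guarantee that each call is independent of the previous one.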