2022-08-30 12:28:59 +02:00
|
|
|
import os
|
|
|
|
from urllib import request
|
|
|
|
from urllib.error import HTTPError, URLError
|
|
|
|
from bs4 import BeautifulSoup, Tag, ResultSet
|
|
|
|
|
2022-08-31 23:53:24 +02:00
|
|
|
|
2022-08-30 12:28:59 +02:00
|
|
|
class WebPicDownloader():
    """
    WebPicDownloader

    webpicdownloader is a simple tool able to
    find and download all pictures on a webpage.

    @author EndMove <contact@endmove.eu>
    @version 1.1.0
    """
    # Variables
    path: str = None        # directory under which the download folders are created
    messenger = None        # callable receiving every progress / error message
    headers: dict = None    # HTTP headers sent with every request

    # <img> attributes inspected for a picture URL, in priority order.
    _IMG_LINK_ATTRIBUTES = ('src', 'data-src', 'data-srcset', 'data-fallback-src')
    # Extension used when the picture URL carries no usable file extension.
    _DEFAULT_IMAGE_TYPE = 'jpg'

    # Constructor
    def __init__(self, path: str = None) -> None:
        """
        Constructor

        :path: -> The directory in which download folders are created. When
                  omitted, the current working directory *at construction time*
                  is used (not the one in effect when the module was imported).
        """
        self.path = os.getcwd() if path is None else path
        self.messenger = print
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"}

    # Internal functions
    def __get_html(self, url: str) -> str:
        """
        Allow to retrieve the HTML content of a website

        :url: -> The url of the page to fetch.\n
        :return: -> The decoded page content.
        """
        req = request.Request(url, headers=self.headers)
        # The context manager guarantees the connection is released.
        with request.urlopen(req) as response:
            raw_html = response.read()
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            # 'replace' keeps going on stray bytes: a few garbled characters
            # do not prevent <img> tags from being found.
            return raw_html.decode(charset, errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            return raw_html.decode('utf-8', errors='replace')

    # The bs4 types below are quoted so they are not evaluated when the class is defined.
    def __find_all_img(self, html: str) -> "ResultSet":
        """
        Allow to retrieve all images of an html page

        :html: -> The HTML content to parse.\n
        :return: -> Every <img> tag found in the page.
        """
        soup = BeautifulSoup(html, 'html.parser')
        return soup.find_all('img')

    def __find_img_link(self, img: "Tag") -> str:
        """
        Allow to retrieve the link of a picture

        The attributes listed in ``_IMG_LINK_ATTRIBUTES`` are tried in order and
        the first one holding an absolute http(s) url wins. This way a lazy-loading
        placeholder in ``src`` does not hide the real url stored in ``data-src``.

        :img: -> The <img> tag (any object exposing ``get(attribute)``).\n
        :return: -> The absolute url of the picture.\n
        :raises ValueError: -> When no attribute is set ("Unable to find image url")
                               or none holds an http(s) url ("Bad image url").
        """
        found_candidate = False
        for attribute in self._IMG_LINK_ATTRIBUTES:
            value = img.get(attribute)
            if not value:
                continue
            found_candidate = True
            link = value.strip()
            if attribute.endswith('srcset'):
                # A srcset is a comma separated list of "url descriptor"
                # entries: keep the url of the first entry only.
                entries = link.split()
                link = entries[0].rstrip(',') if entries else ''
            if link.lower().startswith(('http://', 'https://')):
                return link
        if found_candidate:
            raise ValueError("Bad image url")
        raise ValueError("Unable to find image url")

    def __find_image_type(self, img_link: str) -> str:
        """
        Allow to retrieve the right image type

        The extension is read from the last path segment of the url, ignoring
        the query string and the fragment.

        :img_link: -> The url of the picture.\n
        :return: -> The file extension without the dot, or ``_DEFAULT_IMAGE_TYPE``
                    when the url has no alphanumeric extension.
        """
        url_path = img_link.split('#', 1)[0].split('?', 1)[0]
        file_name = url_path.rsplit('/', 1)[-1]
        extension = os.path.splitext(file_name)[1].lstrip('.')
        # Anything else than a plain alphanumeric token is not safe in a file name.
        return extension if extension.isalnum() else self._DEFAULT_IMAGE_TYPE

    def __download_img(self, url: str, filename: str) -> None:
        """
        Allow to download a picture from internet

        :url: -> The url of the picture.\n
        :filename: -> The path of the file in which the picture is written.
        """
        req = request.Request(url, headers=self.headers)
        with request.urlopen(req) as response:
            raw_img = response.read()
        with open(filename, 'wb') as img:
            img.write(raw_img)

    def __initialize_folder(self, folder_path: str) -> None:
        """
        Init the folder on which put downloaded images

        :folder_path: -> The path of the folder to create.\n
        :raises ValueError: -> When the path already exists.
        """
        try:
            # Creating first and handling the failure avoids the race between
            # an existence check and the creation itself.
            os.mkdir(folder_path)
        except FileExistsError:
            raise ValueError("the folder already exists, it may already contain images") from None

    # Public functions
    def set_messenger_callback(self, callback) -> None:
        """
        Setter to define the callback function called when new messages arrive.

        :callback: -> the callback function to call when a message event is emitted.
        """
        self.messenger = callback

    def download(self, url: str, folder_name: str) -> bool:
        """
        Start downloading all pictures of a website

        Progress and errors are reported through the messenger callback; a
        picture that cannot be processed is reported and skipped.

        :url: -> The url of the website to analyse.\n
        :folder_name: -> The name of the folder in which to store the pictures.\n
        :return: -> True when the page was processed, False when the page could
                    not be fetched or the folder could not be created.
        """
        try:
            count = 0
            folder_path = f"{self.path}/{folder_name}/"
            html = self.__get_html(url)
            images = self.__find_all_img(html)

            # The folder is only created once the page has been fetched, so a
            # bad url does not leave an empty folder behind.
            self.__initialize_folder(folder_path)
            self.messenger(f"WebPicDownloader found {len(images)} images on the website.")

            for i, img in enumerate(images):
                try:
                    self.messenger(f"Start downloading image {i}.")
                    img_link = self.__find_img_link(img)
                    self.__download_img(img_link, f"{folder_path}image-{i}.{self.__find_image_type(img_link)}")
                    self.messenger(f"Download of image {i}, done!")
                    count += 1
                except Exception as err:
                    # Best effort: one broken picture must not stop the others.
                    self.messenger(f"ERROR: Unable to process image {i} -> err[{err}].")

            self.messenger(f"WebPicDownloader has processed {count} images out of {len(images)}.")
            return True
        except HTTPError as err:
            self.messenger(f"ERROR: An http error occured -> err[{err}].")
        except (ValueError, URLError) as err:
            self.messenger(f"ERROR: An error occured with the url -> err[{err}].")
        except Exception as err:
            self.messenger(f"ERROR: An unknown error occured -> err[{err}]")
        return False
|
2022-08-30 12:28:59 +02:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Interactive console front-end: ask for a page and a folder, download,
    # and repeat until the user answers "n".
    wpd = WebPicDownloader()
    wpd.set_messenger_callback(lambda msg: print(f"--> {msg}"))

    keep_going = True
    while keep_going:
        url = input("Website URL ? ")
        name = input("Folder name ? ")
        wpd.download(url, name)
        # Any answer other than "n" (case-insensitive) starts another round.
        answer = input("Do you want to continue [Y/n] ? ")
        keep_going = answer.lower() != "n"

    print("Good bye !")
|