import os
from urllib import request
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup, Tag, ResultSet


class WebPicDownloader():
    """
    WebPicDownloader

    webpicdownloader is a simple tool able to
    find and download all pictures on a webpage.

    @author EndMove <contact@endmove.eu>
    @version 1.0.0
    """
    # Variables
    path: str
    headers: dict = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"}

    # Constructor
    def __init__(self, path: str = os.getcwd()) -> None:
        """Constructor"""
        self.path = path

    # Internal functions
    def __getHtml(self, url: str) -> str:
        """Retrieve the HTML content of a website"""
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        return response.read().decode('utf-8')

    def __findAllImg(self, html: str) -> ResultSet:
        """Retrieve all <img> tags of an HTML page"""
        soup = BeautifulSoup(html, 'html.parser')
        return soup.find_all('img')

    def __findImgLink(self, img: Tag) -> str:
        """Retrieve the link of a picture"""
        # Try the common (lazy-loading) source attributes in order of preference
        if img.get('src'):
            link = img.get('src')
        elif img.get('data-src'):
            link = img.get('data-src')
        elif img.get('data-srcset'):
            link = img.get('data-srcset')
        elif img.get('data-fallback-src'):
            link = img.get('data-fallback-src')
        else:
            raise ValueError("Unable to find image url")
        # Only absolute http(s) links can be downloaded directly
        if 'http' not in link:
            raise ValueError("Bad image link")
        return link

    def __findImageType(self, imgLink: str) -> str:
        """Retrieve the image file type (extension) from its link"""
        imgType = imgLink.split('.')[-1]
        # Drop any query string appended after the extension
        if '?' in imgType:
            imgType = imgType.split('?')[0]
        return imgType

    def __downloadImg(self, url: str, filename: str) -> None:
        """Download a picture from the internet"""
        req = request.Request(url, headers=self.headers)
        rawImg = request.urlopen(req).read()
        with open(filename, 'wb') as img:
            img.write(rawImg)

    def __initializeFolder(self, folderPath: str) -> None:
        """Initialize the folder in which to put the downloaded images"""
        if not os.path.exists(folderPath):
            os.mkdir(folderPath)
        else:
            raise ValueError("the folder already exists, it may already contain images")

    # Public functions
    def download(self, url: str, folderName: str) -> None:
        """
        Start downloading all pictures of a website

        :url: -> The URL of the website to analyse.\n
        :folderName: -> The name of the folder in which to save the pictures.
        """
        try:
            count = 0
            folderPath = f"{self.path}/{folderName}/"
            html = self.__getHtml(url)
            images = self.__findAllImg(html)

            self.__initializeFolder(folderPath)
            print(f"\nWebPicDownloader found {len(images)} images on the website.")

            for i, img in enumerate(images):
                try:
                    imgLink = self.__findImgLink(img)
                    self.__downloadImg(imgLink, f"{folderPath}image-{i}.{self.__findImageType(imgLink)}")
                    print(f"SUCCESS: File n°{i} successfully downloaded.")
                    count += 1
                except ValueError as err:
                    print(f"ERROR: Unable to process image n°{i} -> [{err}].")
                except Exception as err:
                    print(f"ERROR: Unable to process image n°{i}, an unknown error occurred -> [{err}].")

            print(f"WebPicDownloader has processed {count} images out of {len(images)}.")
        except HTTPError as err:
            print(f"ERROR: An HTTP error occurred -> [{err}].")
        except (ValueError, URLError) as err:
            print(f"ERROR: An error occurred with the URL -> [{err}].")
        except Exception as err:
            print(f"ERROR: An unknown error occurred -> [{err}].")

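# Example of programmatic use (illustrative sketch; the path, URL and folder
# name below are placeholder assumptions, not part of the original tool):
#
#   downloader = WebPicDownloader(path="downloads")
#   downloader.download("https://example.com", "example-images")
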
if __name__ == "__main__":
    wpd = WebPicDownloader()
    while True:
        url = input("Website URL ? ")
        name = input("Folder name ? ")
        wpd.download(url, name)
        if "n" == input("Do you want to continue [Y/n] ? ").lower():
            break
    print("Goodbye!")