From 8541eb65aa98841f501c4a01db128d7212eb6c6a Mon Sep 17 00:00:00 2001
From: Chewing_Bever
Date: Tue, 19 Jan 2021 18:01:52 +0100
Subject: [PATCH] Almost fully works now

---
 __main__.py |  5 +++--
 scraper.py  | 47 +++++++++++++++++++++++++++++++----------------
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/__main__.py b/__main__.py
index d0136aa..eaa1dca 100644
--- a/__main__.py
+++ b/__main__.py
@@ -1,6 +1,7 @@
-from scraper import Scraper, Category
+from scraper import Scraper, Category, Image
+import sys
 
 
 if __name__ == "__main__":
     scraper = Scraper()
-    print(scraper.images)
+    scraper.download(sys.argv[1])
diff --git a/scraper.py b/scraper.py
index 758852e..5af1448 100644
--- a/scraper.py
+++ b/scraper.py
@@ -3,6 +3,8 @@ import requests
 import re
 from urllib.parse import urlunsplit
 from multiprocessing.dummy import Pool as ThreadPool
+import os
+import shutil
 
 DOMAIN = "4kwallpapers.com"
 
@@ -18,7 +20,7 @@ class PageHandler:
         return urlunsplit(("https", self.domain, self.path, "", ""))
 
     def get(self, path="", query="", soup=True):
-        r = requests.get(self.relative_url(path, query))
+        r = requests.get(self.relative_url(path, query), allow_redirects=True)
 
         if r.status_code != 200:
             raise requests.exceptions.RequestException()
@@ -31,7 +33,7 @@ class PageHandler:
     def relative_url(self, path="", query=""):
         return urlunsplit(("https", self.domain,
                            "/".join((self.path, path.strip("/"))),
-                           query, ""))
+                           query, "")).strip("/")
 
 
 class Scraper(PageHandler):
@@ -62,6 +64,10 @@ class Scraper(PageHandler):
     def images(self):
         return sum([cat.images for cat in self.categories], [])
 
+    def download(self, dir_path):
+        for cat in self.categories:
+            cat.download(dir_path)
+
 
 class Category(PageHandler):
     def __init__(self, name, domain, path):
@@ -69,6 +75,7 @@ class Category(PageHandler):
         self.name = name
 
         self._images = None
+        self._pool = ThreadPool(50)
 
     @property
     def images(self):
@@ -94,30 +101,38 @@
         pages = [self.relative_url(query="page={}".format(i))
                  for i in range(1, count + 1)]
 
-        self._images = []
-        for i in range(1, count + 1):
-            soup = self.get(query="page={}".format(i))
+        def helper(page_num):
+            soup = self.get(query="page={}".format(page_num))
 
             pics_list = soup.find("div", id="pics-list")
-            self._images.extend(
-                    [Image(self.domain, anchor["href"])
-                        for anchor in pics_list.findAll("a",class_="wallpapers__canvas_image")])
+
+            return [Image(self.domain, anchor["href"])
+                    for anchor in pics_list.findAll("a", class_="wallpapers__canvas_image")]
+
+        self._images = sum(self._pool.map(helper, range(1, count + 1)), [])
 
         return self._images
 
+    def download(self, dir_path):
+        dir_path = os.path.join(dir_path, self.name)
+
+        os.makedirs(dir_path, exist_ok=True)
+
+        self._pool.map(lambda image: image.download(dir_path), self.images)
 
 class Image(PageHandler):
     def __init__(self, domain, path):
         super().__init__(domain, path)
 
         self._image_url = None
-        self._original_image_url = None
 
     def _get_urls(self):
         soup = self.get()
 
-        self._image_url = soup.find("a", id="resolution")["href"]
-        self._original_image_url = soup.find("a", class_="original")["href"]
+        self._image_url = urlunsplit(("https", self.domain,
+                                      soup.find("a",
+                                                class_="original")["href"].strip("/"),
+                                      "", ""))
 
     @property
     def image_url(self):
@@ -126,9 +141,9 @@
 
         return self._image_url
 
-    @property
-    def original_image_url(self):
-        if not self._original_image_url:
-            self._get_urls()
+    def download(self, dir_path):
+        filename = os.path.basename(self.image_url)
 
-        return self._original_image_url
+        with requests.get(self.image_url, stream=True) as r:
+            with open(os.path.join(dir_path, filename), "wb") as f:
+                shutil.copyfileobj(r.raw, f)
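Note on the scraping change above: Category now fans the per-page requests out over a thread pool and flattens the per-page result lists, instead of fetching pages sequentially. A minimal standalone sketch of that pattern, where scrape_page and its fake return values are hypothetical stand-ins for the real BeautifulSoup parsing:

    from multiprocessing.dummy import Pool as ThreadPool

    # Hypothetical stand-in for Category's helper(): pretend each page
    # yields three image paths instead of parsing real HTML.
    def scrape_page(page_num):
        return ["/image-{}-{}".format(page_num, i) for i in range(3)]

    pool = ThreadPool(50)

    # Fan out one task per page, then flatten the per-page lists,
    # mirroring: sum(self._pool.map(helper, range(1, count + 1)), [])
    images = sum(pool.map(scrape_page, range(1, 4)), [])
    print(images)

multiprocessing.dummy exposes the Pool API backed by threads rather than processes, which suits I/O-bound HTTP work like this.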
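The new Image.download streams each file to disk rather than buffering it in memory. A self-contained sketch of the same requests + shutil idiom, with a hypothetical URL; unlike the patch, it also raises on non-200 responses:

    import os
    import shutil
    import requests

    def download(url, dir_path):
        # Name the file after the last path segment of the URL.
        filename = os.path.basename(url)

        # stream=True defers fetching the body; copyfileobj then copies the
        # raw response stream to disk in chunks instead of loading it whole.
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(os.path.join(dir_path, filename), "wb") as f:
                shutil.copyfileobj(r.raw, f)

    # Hypothetical usage:
    # download("https://4kwallpapers.com/some/wallpaper.jpg", ".")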