import re
from multiprocessing.dummy import Pool as ThreadPool  # noqa: F401 -- unused in the code visible here; kept in case other code relies on it
from urllib.parse import urlunsplit

import requests
from bs4 import BeautifulSoup

DOMAIN = "4kwallpapers.com"

# Matches pagination links of the form "?page=N" and captures N.
# Raw string: "\?" in a normal string literal is an invalid escape sequence.
_PAGE_QUERY_RE = re.compile(r"\?page=([0-9]+)")


class PageHandler:
    """Base class for anything addressed by an HTTPS URL on a given domain.

    Attributes are ``domain`` (network location, e.g. ``DOMAIN``) and
    ``path`` (path of this page on that domain).
    """

    def __init__(self, domain, path):
        self.domain = domain
        self.path = path

    @property
    def url(self) -> str:
        """Return the absolute HTTPS URL of this page."""
        return urlunsplit(("https", self.domain, self.path, "", ""))

    def get(self, path="", query="", soup=True, timeout=30):
        """Fetch a URL relative to this page.

        Args:
            path: Sub-path appended to this page's own path.
            query: Query string (without the leading ``?``).
            soup: When true, return the parsed document; otherwise return
                the raw ``requests`` response.
            timeout: Seconds passed to ``requests.get`` so a stalled server
                cannot block the caller forever.

        Returns:
            A ``BeautifulSoup`` document (``html.parser``) when ``soup`` is
            true, else the ``requests.Response``.

        Raises:
            requests.exceptions.RequestException: If the request fails or the
                status code is anything other than 200.
        """
        target = self.relative_url(path, query)
        r = requests.get(target, timeout=timeout)
        if r.status_code != 200:
            raise requests.exceptions.RequestException(
                f"GET {target} returned HTTP {r.status_code}", response=r
            )
        if soup:
            return BeautifulSoup(r.content, "html.parser")
        return r

    def relative_url(self, path="", query="") -> str:
        """Build an absolute HTTPS URL for ``path``/``query`` under this page.

        Leading and trailing slashes are stripped from ``path`` before it is
        joined to ``self.path`` with a single ``/``.
        """
        joined = "/".join((self.path, path.strip("/")))
        return urlunsplit(("https", self.domain, joined, query, ""))


class Scraper(PageHandler):
    """Entry point: the site's root page, from which categories are listed."""

    def __init__(self, domain=DOMAIN):
        super().__init__(domain, "")
        self._categories = None  # filled on first access to ``categories``

    @property
    def categories(self):
        """Return the list of ``Category`` objects found on the root page.

        The root page is fetched once; the result is cached on the instance.
        """
        # ``is None`` rather than truthiness, so an empty result is cached too.
        if self._categories is None:
            soup = self.get(soup=True)
            ul = soup.find("ul", class_="cats")
            cats = []
            for li in ul.find_all("li"):
                anchor = li.find("a")
                cats.append(
                    Category(anchor.get_text(), self.domain, anchor["href"].strip("/"))
                )
            self._categories = cats
        return self._categories

    @property
    def images(self):
        """Return every ``Image`` of every category as one flat list."""
        # Nested comprehension instead of sum(lists, []), which copies the
        # accumulated list on every addition.
        return [image for cat in self.categories for image in cat.images]


class Category(PageHandler):
    """A named listing page whose wallpapers may span several result pages."""

    def __init__(self, name, domain, path):
        super().__init__(domain, path)
        self.name = name
        self._images = None  # filled on first access to ``images``

    @staticmethod
    def _page_count(soup) -> int:
        """Return the number of result pages advertised by ``soup``.

        The ``<p class="pages">`` element is absent when there is only one
        page; in that case, or when none of its links look like ``?page=N``,
        the count is 1.
        """
        pages_p = soup.find("p", class_="pages")
        if not pages_p:
            return 1
        numbers = [
            int(res.group(1))
            for anchor in pages_p.find_all("a")
            if (res := _PAGE_QUERY_RE.match(anchor.get("href", "")))
        ]
        return max(numbers, default=1)

    @property
    def images(self):
        """Return the list of ``Image`` objects linked from this category.

        The base page is fetched to learn the page count, then each page
        ``?page=1`` .. ``?page=count`` is fetched in turn. The result is
        cached on the instance.
        """
        if self._images is None:
            count = self._page_count(self.get())

            # Collect into a local list and publish it only once every page
            # has been read, so a failure midway does not leave a partial
            # list cached as if it were the full result.
            found = []
            for i in range(1, count + 1):
                soup = self.get(query="page={}".format(i))
                pics_list = soup.find("div", id="pics-list")
                found.extend(
                    Image(self.domain, anchor["href"])
                    for anchor in pics_list.find_all(
                        "a", class_="wallpapers__canvas_image"
                    )
                )
            self._images = found
        return self._images


class Image(PageHandler):
    """A single wallpaper page exposing two download links."""

    def __init__(self, domain, path):
        super().__init__(domain, path)
        self._image_url = None
        self._original_image_url = None

    def _get_urls(self):
        """Fetch this page and store both download hrefs on the instance."""
        soup = self.get()
        self._image_url = soup.find("a", id="resolution")["href"]
        self._original_image_url = soup.find("a", class_="original")["href"]

    @property
    def image_url(self):
        """Return the href of the ``<a id="resolution">`` link (cached)."""
        if self._image_url is None:
            self._get_urls()
        return self._image_url

    @property
    def original_image_url(self):
        """Return the href of the ``<a class="original">`` link (cached)."""
        if self._original_image_url is None:
            self._get_urls()
        return self._original_image_url