From 70d2d0de101c18a5034890e964310df1611c55dc Mon Sep 17 00:00:00 2001
From: Chewing_Bever
Date: Tue, 19 Jan 2021 11:50:43 +0100
Subject: [PATCH] Initial commit

---
 .gitignore  |   1 +
 __main__.py |   6 +++
 scraper.py  | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 __main__.py
 create mode 100644 scraper.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c18dd8d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/__main__.py b/__main__.py
new file mode 100644
index 0000000..d0136aa
--- /dev/null
+++ b/__main__.py
@@ -0,0 +1,6 @@
+from scraper import Scraper, Category
+
+
+if __name__ == "__main__":
+    scraper = Scraper()
+    print(scraper.images)
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..758852e
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,134 @@
+from bs4 import BeautifulSoup
+import requests
+import re
+from urllib.parse import urlunsplit
+from multiprocessing.dummy import Pool as ThreadPool
+
+
+DOMAIN = "4kwallpapers.com"
+
+
+class PageHandler:
+    def __init__(self, domain, path):
+        self.domain = domain
+        self.path = path
+
+    @property
+    def url(self):
+        return urlunsplit(("https", self.domain, self.path, "", ""))
+
+    def get(self, path="", query="", soup=True):
+        r = requests.get(self.relative_url(path, query))
+
+        if r.status_code != 200:
+            raise requests.exceptions.RequestException()
+
+        if soup:
+            return BeautifulSoup(r.content, "html.parser")
+
+        return r
+
+    def relative_url(self, path="", query=""):
+        return urlunsplit(("https", self.domain, "/".join((self.path,
+                                                           path.strip("/"))),
+                           query, ""))
+
+
+class Scraper(PageHandler):
+    def __init__(self, domain=DOMAIN):
+        super().__init__(domain, "")
+
+        self._categories = None
+
+    @property
+    def categories(self):
+        if not self._categories:
+            # Get them if not cached
+            soup = self.get(soup=True)
+            ul = soup.find("ul", class_="cats")
+            cats = []
+
+            for li in ul.findAll("li"):
+                anchor = li.find("a")
+
+                cats.append(Category(anchor.get_text(), self.domain,
+                                     anchor["href"].strip("/")))
+
+            self._categories = cats
+
+        return self._categories
+
+    @property
+    def images(self):
+        return sum([cat.images for cat in self.categories], [])
+
+
+class Category(PageHandler):
+    def __init__(self, name, domain, path):
+        super().__init__(domain, path)
+
+        self.name = name
+        self._images = None
+
+    @property
+    def images(self):
+        if not self._images:
+            # Get base page
+            soup = self.get()
+
+            # Get how many pages there are
+            pages_p = soup.find("p", class_="pages")
+
+            count = 1
+
+            # The paragraph doesn't exist if there's only one page
+            if pages_p:
+                anchors = pages_p.findAll("a")
+                count = max([int(res.group(1))
+                             for anchor in anchors
+                             if (res := re.match("\?page=([0-9]+)",
+                                                 anchor["href"]))]
+                            )
+
+            # Now, we get the URL for every wallpaper's page
+            pages = [self.relative_url(query="page={}".format(i))
+                     for i in range(1, count + 1)]
+
+            self._images = []
+            for i in range(1, count + 1):
+                soup = self.get(query="page={}".format(i))
+
+                pics_list = soup.find("div", id="pics-list")
+                self._images.extend(
+                    [Image(self.domain, anchor["href"])
+                     for anchor in pics_list.findAll("a", class_="wallpapers__canvas_image")])
+
+        return self._images
+
+
+class Image(PageHandler):
+    def __init__(self, domain, path):
+        super().__init__(domain, path)
+
+        self._image_url = None
+        self._original_image_url = None
+
+    def _get_urls(self):
+        soup = self.get()
+
+        self._image_url = soup.find("a", id="resolution")["href"]
+        self._original_image_url = soup.find("a", class_="original")["href"]
+
+    @property
+    def image_url(self):
+        if not self._image_url:
+            self._get_urls()
+
+        return self._image_url
+
+    @property
+    def original_image_url(self):
+        if not self._original_image_url:
+            self._get_urls()
+
+        return self._original_image_url