Initial commit

master
Jef Roosens 2021-01-19 11:50:43 +01:00
commit 70d2d0de10
3 changed files with 141 additions and 0 deletions

.gitignore vendored 100644

@@ -0,0 +1 @@
__pycache__/

__main__.py 100644

@@ -0,0 +1,6 @@
from scraper import Scraper, Category

if __name__ == "__main__":
    scraper = Scraper()

    print(scraper.images)

scraper.py 100644

@@ -0,0 +1,134 @@
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urlunsplit
# Unused in this commit; presumably intended for parallel fetching later
from multiprocessing.dummy import Pool as ThreadPool

DOMAIN = "4kwallpapers.com"
class PageHandler:
    def __init__(self, domain, path):
        self.domain = domain
        self.path = path

    @property
    def url(self):
        return urlunsplit(("https", self.domain, self.path, "", ""))

    def get(self, path="", query="", soup=True):
        r = requests.get(self.relative_url(path, query))

        if r.status_code != 200:
            raise requests.exceptions.RequestException()

        if soup:
            return BeautifulSoup(r.content, "html.parser")

        return r

    def relative_url(self, path="", query=""):
        return urlunsplit(("https", self.domain,
                           "/".join((self.path, path.strip("/"))),
                           query, ""))
class Scraper(PageHandler):
    def __init__(self, domain=DOMAIN):
        super().__init__(domain, "")

        self._categories = None

    @property
    def categories(self):
        if not self._categories:
            # Get them if not cached
            soup = self.get(soup=True)
            ul = soup.find("ul", class_="cats")

            cats = []

            for li in ul.findAll("li"):
                anchor = li.find("a")
                cats.append(Category(anchor.get_text(), self.domain,
                                     anchor["href"].strip("/")))

            self._categories = cats

        return self._categories

    @property
    def images(self):
        # Flatten the per-category lists into a single list
        return sum([cat.images for cat in self.categories], [])
class Category(PageHandler):
    def __init__(self, name, domain, path):
        super().__init__(domain, path)

        self.name = name
        self._images = None

    @property
    def images(self):
        if not self._images:
            # Get base page
            soup = self.get()

            # Get how many pages there are; the paragraph doesn't exist
            # if there's only one page
            pages_p = soup.find("p", class_="pages")
            count = 1

            if pages_p:
                anchors = pages_p.findAll("a")
                count = max(int(res.group(1))
                            for anchor in anchors
                            if (res := re.match(r"\?page=([0-9]+)",
                                                anchor["href"])))

            # Now, collect every wallpaper's page from each listing page
            self._images = []

            for i in range(1, count + 1):
                soup = self.get(query="page={}".format(i))
                pics_list = soup.find("div", id="pics-list")

                self._images.extend(
                    [Image(self.domain, anchor["href"])
                     for anchor in pics_list.findAll(
                         "a", class_="wallpapers__canvas_image")])

        return self._images
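
# Illustration of the pagination regex above (not part of the commit):
# re.match(r"\?page=([0-9]+)", "?page=12").group(1) returns "12", so a
# pager whose last link is "?page=12" yields count = 12.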
class Image(PageHandler):
    def __init__(self, domain, path):
        super().__init__(domain, path)

        self._image_url = None
        self._original_image_url = None

    def _get_urls(self):
        # Both URLs live on the same detail page, so a single request
        # fills both caches
        soup = self.get()

        self._image_url = soup.find("a", id="resolution")["href"]
        self._original_image_url = soup.find("a", class_="original")["href"]

    @property
    def image_url(self):
        if not self._image_url:
            self._get_urls()

        return self._image_url

    @property
    def original_image_url(self):
        if not self._original_image_url:
            self._get_urls()

        return self._original_image_url
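
A minimal usage sketch against this commit, assuming the selectors above still match the live site and that the scraped hrefs are absolute URLs; the filename is made up for illustration:

    from scraper import Scraper
    import requests

    scraper = Scraper()

    # Scrape only the first category to keep the request count down,
    # then download the full-resolution version of its first wallpaper
    image = scraper.categories[0].images[0]
    r = requests.get(image.original_image_url)

    with open("wallpaper.jpg", "wb") as f:
        f.write(r.content)

The unused ThreadPool import suggests parallel fetching was planned. Each Image resolves its URLs with one HTTP request, so a thread pool would hide most of that latency; a possible shape, not the author's confirmed design:

    from multiprocessing.dummy import Pool as ThreadPool

    with ThreadPool(8) as pool:
        # Threads rather than processes are fine here: the work is
        # I/O-bound, one request per wallpaper detail page
        urls = pool.map(lambda img: img.original_image_url, scraper.images)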