135 lines
3.6 KiB
Python
135 lines
3.6 KiB
Python
|
from bs4 import BeautifulSoup
|
||
|
import requests
|
||
|
import re
|
||
|
from urllib.parse import urlunsplit
|
||
|
from multiprocessing.dummy import Pool as ThreadPool
|
||
|
|
||
|
|
||
|
# Default host name used by Scraper when no domain is passed explicitly.
DOMAIN = "4kwallpapers.com"
|
||
|
|
||
|
|
||
|
class PageHandler:
    """Base class for objects backed by a page on an HTTPS site.

    Attributes:
        domain: Host name of the site.
        path: Path of this page on ``domain``.
    """

    # Seconds to wait on the server before giving up. Without a timeout
    # ``requests`` can block indefinitely on an unresponsive host.
    DEFAULT_TIMEOUT = 30

    def __init__(self, domain, path):
        """Store the host name and the path this handler points at."""
        self.domain = domain
        self.path = path

    @property
    def url(self):
        """Return the absolute ``https`` URL built from domain and path."""
        return urlunsplit(("https", self.domain, self.path, "", ""))

    def get(self, path="", query="", soup=True, *, timeout=None):
        """Fetch a URL relative to this page.

        Args:
            path: Extra path segment appended below ``self.path``.
            query: Query string (without the leading ``?``).
            soup: When true, return the body parsed by BeautifulSoup with
                the ``html.parser`` backend; otherwise return the raw
                ``requests`` response.
            timeout: Seconds to wait for the server; ``None`` selects
                ``DEFAULT_TIMEOUT``.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the status code is anything other than 200.
        """
        target = self.relative_url(path, query)
        if timeout is None:
            timeout = self.DEFAULT_TIMEOUT

        response = requests.get(target, timeout=timeout)

        if response.status_code != 200:
            # Same exception type as before, but now carrying the URL, the
            # status code and the response so failures can be diagnosed.
            raise requests.exceptions.RequestException(
                f"GET {target} returned HTTP status {response.status_code}",
                response=response,
            )

        if soup:
            return BeautifulSoup(response.content, "html.parser")

        return response

    def relative_url(self, path="", query=""):
        """Return the absolute URL of ``path`` below this page.

        ``self.path`` and ``path`` (with surrounding slashes stripped) are
        always joined with a ``/``, so an empty ``path`` yields this page's
        path followed by a trailing slash.
        """
        joined_path = "/".join((self.path, path.strip("/")))
        return urlunsplit(("https", self.domain, joined_path, query, ""))
|
||
|
|
||
|
|
||
|
class Scraper(PageHandler):
    """Handler for the site's front page; exposes categories and images."""

    def __init__(self, domain=DOMAIN):
        """Create a scraper rooted at the front page of ``domain``."""
        super().__init__(domain, "")

        # Filled on first access to ``categories``; None means "not fetched".
        self._categories = None

    @property
    def categories(self):
        """Return the list of Category objects linked from the front page.

        The front page is fetched once and the result is cached. Each
        ``<li>`` inside ``<ul class="cats">`` that contains a link with an
        ``href`` becomes one Category, named after the link text.
        """
        # Test against None rather than truthiness so an empty result is
        # cached as well instead of triggering a new request on every access.
        if self._categories is None:
            soup = self.get(soup=True)
            cat_list = soup.find("ul", class_="cats")

            # Items without a usable link are skipped rather than crashing
            # the whole listing.
            self._categories = [
                Category(anchor.get_text(), self.domain,
                         anchor["href"].strip("/"))
                for li in cat_list.find_all("li")
                if (anchor := li.find("a", href=True)) is not None
            ]

        return self._categories

    @property
    def images(self):
        """Return the images of every category as one flat list."""
        # A nested comprehension flattens in linear time; sum(lists, [])
        # copies the accumulated list once per category.
        return [image for cat in self.categories for image in cat.images]
|
||
|
|
||
|
|
||
|
class Category(PageHandler):
    """A category page and the images listed on its paginated sub-pages."""

    # Pagination links are matched against hrefs of the form "?page=<n>".
    _PAGE_HREF = re.compile(r"\?page=([0-9]+)")

    def __init__(self, name, domain, path):
        """Store the display name and the page location of the category."""
        super().__init__(domain, path)

        self.name = name
        # Filled on first access to ``images``; None means "not fetched".
        self._images = None

    @property
    def images(self):
        """Return the list of Image objects in this category.

        All listing pages are fetched on first access and the result is
        cached. The cache is only written once every page has been read,
        so a failed request never leaves a partial list behind.
        """
        if self._images is None:
            self._images = self._fetch_images()

        return self._images

    def _page_count(self, soup):
        """Return the highest page number linked from ``soup`` (at least 1)."""
        pages_p = soup.find("p", class_="pages")

        # The paragraph doesn't exist if there's only one page
        if pages_p is None:
            return 1

        page_numbers = (
            int(match.group(1))
            for anchor in pages_p.find_all("a", href=True)
            if (match := self._PAGE_HREF.match(anchor["href"]))
        )
        # default=1 covers a pagination paragraph with no matching links.
        return max(page_numbers, default=1)

    def _fetch_images(self):
        """Fetch every listing page and collect the Image objects on it."""
        # The base page is only used to learn how many pages there are.
        # NOTE(review): it is presumably identical to "?page=1", in which
        # case one request could be saved - confirm before reusing it.
        count = self._page_count(self.get())

        collected = []
        for page in range(1, count + 1):
            soup = self.get(query="page={}".format(page))

            pics_list = soup.find("div", id="pics-list")
            collected.extend(
                Image(self.domain, anchor["href"])
                for anchor in pics_list.find_all(
                    "a", class_="wallpapers__canvas_image")
            )

        return collected
|
||
|
|
||
|
|
||
|
class Image(PageHandler):
    """A single wallpaper page and the download links found on it."""

    def __init__(self, domain, path):
        """Point the handler at the wallpaper page; nothing is fetched yet."""
        super().__init__(domain, path)

        # Both links are looked up lazily by _get_urls().
        self._image_url = None
        self._original_image_url = None

    def _get_urls(self):
        """Fetch the page and store the hrefs of both download links."""
        page = self.get()

        resolution_link = page.find("a", id="resolution")
        self._image_url = resolution_link["href"]

        original_link = page.find("a", class_="original")
        self._original_image_url = original_link["href"]

    @property
    def image_url(self):
        """Return the href of the ``a#resolution`` link, fetching if needed."""
        if self._image_url:
            return self._image_url

        self._get_urls()
        return self._image_url

    @property
    def original_image_url(self):
        """Return the href of the ``a.original`` link, fetching if needed."""
        if self._original_image_url:
            return self._original_image_url

        self._get_urls()
        return self._original_image_url
|