Almost fully works now

Branch: master
Jef Roosens committed on 2021-01-19 18:01:52 +01:00
parent 9314fe8985
commit 8541eb65aa
2 changed files with 34 additions and 18 deletions

Changed file 1 of 2:

@@ -1,6 +1,7 @@
-from scraper import Scraper, Category
+from scraper import Scraper, Category, Image
+import sys

 if __name__ == "__main__":
     scraper = Scraper()
-    print(scraper.images)
+    scraper.download(sys.argv[1])

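With this change the entry script no longer just prints the scraped image list; it downloads everything into the directory given as the first command-line argument, which is passed straight to Scraper.download. A minimal sketch of the same flow, with a hypothetical output directory in place of sys.argv[1]:

    # Sketch only: "wallpapers" is a made-up target directory.
    from scraper import Scraper

    scraper = Scraper()
    # Each category ends up in its own subdirectory, e.g. wallpapers/<category name>/
    scraper.download("wallpapers")
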
Changed file 2 of 2:

@@ -3,6 +3,8 @@ import requests
 import re
 from urllib.parse import urlunsplit
 from multiprocessing.dummy import Pool as ThreadPool
+import os
+import shutil

 DOMAIN = "4kwallpapers.com"
@@ -18,7 +20,7 @@ class PageHandler:
         return urlunsplit(("https", self.domain, self.path, "", ""))

     def get(self, path="", query="", soup=True):
-        r = requests.get(self.relative_url(path, query))
+        r = requests.get(self.relative_url(path, query), allow_redirects=True)

         if r.status_code != 200:
             raise requests.exceptions.RequestException()
@@ -31,7 +33,7 @@ class PageHandler:
     def relative_url(self, path="", query=""):
         return urlunsplit(("https", self.domain, "/".join((self.path,
                                                            path.strip("/"))),
-                           query, ""))
+                           query, "")).strip("/")


 class Scraper(PageHandler):
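For reference, the effect of building URLs with urllib.parse.urlunsplit and the newly added .strip("/") can be seen in isolation (example values made up):

    from urllib.parse import urlunsplit

    # Path and query are assembled into a full URL:
    print(urlunsplit(("https", "4kwallpapers.com", "nature", "page=2", "")))
    # -> https://4kwallpapers.com/nature?page=2

    # With an empty path and query the bare result keeps a trailing slash,
    # which the added .strip("/") removes:
    print(urlunsplit(("https", "4kwallpapers.com", "/", "", "")).strip("/"))
    # -> https://4kwallpapers.com
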
@@ -62,6 +64,10 @@ class Scraper(PageHandler):
     def images(self):
         return sum([cat.images for cat in self.categories], [])

+    def download(self, dir_path):
+        for cat in self.categories:
+            cat.download(dir_path)
+

 class Category(PageHandler):
     def __init__(self, name, domain, path):
@@ -69,6 +75,7 @@ class Category(PageHandler):
         self.name = name
         self._images = None
+        self._pool = ThreadPool(50)

     @property
     def images(self):
@@ -94,30 +101,38 @@
         pages = [self.relative_url(query="page={}".format(i))
                  for i in range(1, count + 1)]

-        self._images = []
-        for i in range(1, count + 1):
-            soup = self.get(query="page={}".format(i))
+        def helper(page_num):
+            soup = self.get(query="page={}".format(page_num))

             pics_list = soup.find("div", id="pics-list")

-            self._images.extend(
-                [Image(self.domain, anchor["href"])
-                 for anchor in pics_list.findAll("a",class_="wallpapers__canvas_image")])
+            return [Image(self.domain, anchor["href"])
+                    for anchor in pics_list.findAll("a", class_="wallpapers__canvas_image")]
+
+        self._images = sum(self._pool.map(helper, range(1, count + 1)), [])

         return self._images

+    def download(self, dir_path):
+        dir_path = os.path.join(dir_path, self.name)
+
+        os.makedirs(dir_path, exist_ok=True)
+
+        self._pool.map(lambda image: image.download(dir_path), self.images)
+

 class Image(PageHandler):
     def __init__(self, domain, path):
         super().__init__(domain, path)

         self._image_url = None
-        self._original_image_url = None

     def _get_urls(self):
         soup = self.get()

-        self._image_url = soup.find("a", id="resolution")["href"]
-        self._original_image_url = soup.find("a", class_="original")["href"]
+        self._image_url = urlunsplit(("https", self.domain,
+                                      soup.find("a",
+                                                class_="original")["href"].strip("/"),
+                                      "", ""))

     @property
     def image_url(self):
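The new _pool comes from multiprocessing.dummy, which exposes the multiprocessing.Pool API backed by threads rather than processes, a good fit for these I/O-bound HTTP requests; both the per-page image listing and the per-image downloads above are fanned out through pool.map. A self-contained sketch of that pattern, using a made-up fetch_page helper in place of the real page scraping:

    from multiprocessing.dummy import Pool as ThreadPool

    def fetch_page(page_num):
        # Stand-in for fetching and parsing one gallery page; returns a list
        # so the flattening below mirrors what Category.images does.
        return ["image-from-page-{}".format(page_num)]

    pool = ThreadPool(50)
    # map() blocks until every page has been handled, then sum(..., [])
    # flattens the per-page lists into a single list of images.
    images = sum(pool.map(fetch_page, range(1, 6)), [])
    print(images)
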
@@ -126,9 +141,9 @@ class Image(PageHandler):
         return self._image_url

-    @property
-    def original_image_url(self):
-        if not self._original_image_url:
-            self._get_urls()
-
-        return self._original_image_url
+    def download(self, dir_path):
+        filename = os.path.basename(self.image_url)
+
+        with requests.get(self.image_url, stream=True) as r:
+            with open(os.path.join(dir_path, filename), "wb") as f:
+                shutil.copyfileobj(r.raw, f)
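
Image.download streams the wallpaper to disk instead of loading it into memory first: requests.get(..., stream=True) leaves the body on the socket, and shutil.copyfileobj copies the raw response file object into the output file in chunks. A standalone sketch of the same pattern, with a hypothetical URL and output directory:

    import os
    import shutil

    import requests

    def save_image(url, dir_path):
        # The local filename is the last path segment of the URL.
        filename = os.path.basename(url)

        # stream=True keeps the body out of memory; copyfileobj then writes
        # the raw response stream to disk in chunks.
        with requests.get(url, stream=True) as r:
            with open(os.path.join(dir_path, filename), "wb") as f:
                shutil.copyfileobj(r.raw, f)

    # Hypothetical values; in the scraper the URL comes from Image.image_url.
    save_image("https://4kwallpapers.com/images/example-3840x2160.jpg", ".")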