Almost fully works now

parent 9314fe8985
commit 8541eb65aa
@@ -1,6 +1,7 @@
-from scraper import Scraper, Category
+from scraper import Scraper, Category, Image
+import sys
 
 
 if __name__ == "__main__":
     scraper = Scraper()
-    print(scraper.images)
+    scraper.download(sys.argv[1])
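The entry point now takes the output directory from the command line and drives the new download pipeline. As a usage illustration only (the entry script's filename is not shown on this page, so main.py is an assumption, and the missing-argument guard is not part of the commit):

    # Hypothetical invocation, assuming the entry script is saved as main.py:
    #     python main.py ~/Pictures/wallpapers
    import sys

    from scraper import Scraper

    if len(sys.argv) < 2:  # guard added for illustration; the commit indexes argv directly
        sys.exit("usage: main.py <output-dir>")

    Scraper().download(sys.argv[1])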
scraper.py | 47
@@ -3,6 +3,8 @@ import requests
 import re
 from urllib.parse import urlunsplit
 from multiprocessing.dummy import Pool as ThreadPool
+import os
+import shutil
 
 
 DOMAIN = "4kwallpapers.com"
@@ -18,7 +20,7 @@ class PageHandler:
         return urlunsplit(("https", self.domain, self.path, "", ""))
 
     def get(self, path="", query="", soup=True):
-        r = requests.get(self.relative_url(path, query))
+        r = requests.get(self.relative_url(path, query), allow_redirects=True)
 
         if r.status_code != 200:
             raise requests.exceptions.RequestException()
@@ -31,7 +33,7 @@ class PageHandler:
     def relative_url(self, path="", query=""):
         return urlunsplit(("https", self.domain, "/".join((self.path,
                                                            path.strip("/"))),
-                           query, ""))
+                           query, "")).strip("/")
 
 
 class Scraper(PageHandler):
@@ -62,6 +64,10 @@ class Scraper(PageHandler):
     def images(self):
         return sum([cat.images for cat in self.categories], [])
 
+    def download(self, dir_path):
+        for cat in self.categories:
+            cat.download(dir_path)
+
 
 class Category(PageHandler):
     def __init__(self, name, domain, path):
@@ -69,6 +75,7 @@ class Category(PageHandler):
 
         self.name = name
         self._images = None
+        self._pool = ThreadPool(50)
 
     @property
     def images(self):
@@ -94,30 +101,38 @@
         pages = [self.relative_url(query="page={}".format(i))
                  for i in range(1, count + 1)]
 
-        self._images = []
-        for i in range(1, count + 1):
-            soup = self.get(query="page={}".format(i))
-
-            pics_list = soup.find("div", id="pics-list")
-            self._images.extend(
-                [Image(self.domain, anchor["href"])
-                 for anchor in pics_list.findAll("a", class_="wallpapers__canvas_image")])
+        def helper(page_num):
+            soup = self.get(query="page={}".format(page_num))
+
+            pics_list = soup.find("div", id="pics-list")
+            return [Image(self.domain, anchor["href"])
+                    for anchor in pics_list.findAll("a", class_="wallpapers__canvas_image")]
+
+        self._images = sum(self._pool.map(helper, range(1, count + 1)), [])
 
         return self._images
 
+    def download(self, dir_path):
+        dir_path = os.path.join(dir_path, self.name)
+
+        os.makedirs(dir_path, exist_ok=True)
+
+        self._pool.map(lambda image: image.download(dir_path), self.images)
+
 
 class Image(PageHandler):
     def __init__(self, domain, path):
         super().__init__(domain, path)
 
         self._image_url = None
-        self._original_image_url = None
 
     def _get_urls(self):
         soup = self.get()
 
-        self._image_url = soup.find("a", id="resolution")["href"]
-        self._original_image_url = soup.find("a", class_="original")["href"]
+        self._image_url = urlunsplit(("https", self.domain,
+                                      soup.find("a",
+                                                class_="original")["href"].strip("/"),
+                                      "", ""))
 
     @property
     def image_url(self):
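The sequential per-page loop is replaced by a thread pool: multiprocessing.dummy exposes the multiprocessing.Pool API backed by threads, which suits these I/O-bound page fetches, and pool.map preserves input order. A self-contained sketch of the map-and-flatten pattern the commit adopts (fetch_page is a stand-in for the real per-page scrape, not part of the commit):

    from multiprocessing.dummy import Pool as ThreadPool

    def fetch_page(page_num):
        # stand-in for self.get() plus the BeautifulSoup parsing
        return ["image-{}-{}".format(page_num, i) for i in range(3)]

    pool = ThreadPool(50)
    # each call returns a list; sum(..., []) concatenates them into one flat list
    images = sum(pool.map(fetch_page, range(1, 4)), [])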
@@ -126,9 +141,9 @@ class Image(PageHandler):
 
         return self._image_url
 
-    @property
-    def original_image_url(self):
-        if not self._original_image_url:
-            self._get_urls()
-
-        return self._original_image_url
+    def download(self, dir_path):
+        filename = os.path.basename(self.image_url)
+
+        with requests.get(self.image_url, stream=True) as r:
+            with open(os.path.join(dir_path, filename), "wb") as f:
+                shutil.copyfileobj(r.raw, f)
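Image.download streams the response body straight to disk instead of buffering the whole file in memory. One caveat with this pattern: r.raw bypasses requests' transfer decoding, so it assumes the server is not compressing the body, which usually holds for already-compressed formats such as JPEG and PNG. A standalone sketch of the same pattern (download_file and its parameters are illustrative, not part of the commit):

    import os
    import shutil

    import requests

    def download_file(url, dir_path):
        filename = os.path.basename(url)

        # stream=True defers the body; copyfileobj writes it out in chunks
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(os.path.join(dir_path, filename), "wb") as f:
                shutil.copyfileobj(r.raw, f)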