Almost fully works now
							parent
							
								
									9314fe8985
								
							
						
					
					
						commit
						8541eb65aa
					
				| 
						 | 
				
			
			@ -1,6 +1,7 @@
 | 
			
		|||
from scraper import Scraper, Category
 | 
			
		||||
from scraper import Scraper, Category, Image
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    scraper = Scraper()
 | 
			
		||||
    print(scraper.images)
 | 
			
		||||
    scraper.download(sys.argv[1])
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										47
									
								
								scraper.py
								
								
								
								
							
							
						
						
									
										47
									
								
								scraper.py
								
								
								
								
							| 
						 | 
				
			
			@ -3,6 +3,8 @@ import requests
 | 
			
		|||
import re
 | 
			
		||||
from urllib.parse import urlunsplit
 | 
			
		||||
from multiprocessing.dummy import Pool as ThreadPool
 | 
			
		||||
import os
 | 
			
		||||
import shutil
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DOMAIN = "4kwallpapers.com"
 | 
			
		||||
| 
						 | 
				
			
			@ -18,7 +20,7 @@ class PageHandler:
 | 
			
		|||
        return urlunsplit(("https", self.domain, self.path, "", ""))
 | 
			
		||||
 | 
			
		||||
    def get(self, path="", query="", soup=True):
 | 
			
		||||
        r = requests.get(self.relative_url(path, query))
 | 
			
		||||
        r = requests.get(self.relative_url(path, query), allow_redirects=True)
 | 
			
		||||
 | 
			
		||||
        if r.status_code != 200:
 | 
			
		||||
            raise requests.exceptions.RequestException()
 | 
			
		||||
| 
						 | 
				
			
			@ -31,7 +33,7 @@ class PageHandler:
 | 
			
		|||
    def relative_url(self, path="", query=""):
 | 
			
		||||
        return urlunsplit(("https", self.domain, "/".join((self.path,
 | 
			
		||||
                                                        path.strip("/"))),
 | 
			
		||||
                          query, ""))
 | 
			
		||||
                          query, "")).strip("/")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Scraper(PageHandler):
 | 
			
		||||
| 
						 | 
				
			
			@ -62,6 +64,10 @@ class Scraper(PageHandler):
 | 
			
		|||
    def images(self):
 | 
			
		||||
        return sum([cat.images for cat in self.categories], [])
 | 
			
		||||
 | 
			
		||||
    def download(self, dir_path):
 | 
			
		||||
        for cat in self.categories:
 | 
			
		||||
            cat.download(dir_path)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Category(PageHandler):
 | 
			
		||||
    def __init__(self, name, domain, path):
 | 
			
		||||
| 
						 | 
				
			
			@ -69,6 +75,7 @@ class Category(PageHandler):
 | 
			
		|||
 | 
			
		||||
        self.name = name
 | 
			
		||||
        self._images = None
 | 
			
		||||
        self._pool = ThreadPool(50)
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def images(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -94,30 +101,38 @@ class Category(PageHandler):
 | 
			
		|||
            pages = [self.relative_url(query="page={}".format(i))
 | 
			
		||||
                     for i in range(1, count + 1)]
 | 
			
		||||
 | 
			
		||||
            self._images = []
 | 
			
		||||
            for i in range(1, count + 1):
 | 
			
		||||
                soup = self.get(query="page={}".format(i))
 | 
			
		||||
            def helper(page_num):
 | 
			
		||||
                soup = self.get(query="page={}".format(page_num))
 | 
			
		||||
 | 
			
		||||
                pics_list = soup.find("div", id="pics-list")
 | 
			
		||||
                self._images.extend(
 | 
			
		||||
                    [Image(self.domain, anchor["href"])
 | 
			
		||||
                     for anchor in pics_list.findAll("a",class_="wallpapers__canvas_image")])
 | 
			
		||||
 | 
			
		||||
                return [Image(self.domain, anchor["href"])
 | 
			
		||||
                        for anchor in pics_list.findAll("a", class_="wallpapers__canvas_image")]
 | 
			
		||||
 | 
			
		||||
            self._images = sum(self._pool.map(helper, range(1, count + 1)), [])
 | 
			
		||||
 | 
			
		||||
        return self._images
 | 
			
		||||
 | 
			
		||||
    def download(self, dir_path):
 | 
			
		||||
        dir_path = os.path.join(dir_path, self.name)
 | 
			
		||||
 | 
			
		||||
        os.makedirs(dir_path, exist_ok=True)
 | 
			
		||||
 | 
			
		||||
        self._pool.map(lambda image: image.download(dir_path), self.images)
 | 
			
		||||
 | 
			
		||||
class Image(PageHandler):
 | 
			
		||||
    def __init__(self, domain, path):
 | 
			
		||||
        super().__init__(domain, path)
 | 
			
		||||
 | 
			
		||||
        self._image_url = None
 | 
			
		||||
        self._original_image_url = None
 | 
			
		||||
 | 
			
		||||
    def _get_urls(self):
 | 
			
		||||
        soup = self.get()
 | 
			
		||||
 | 
			
		||||
        self._image_url = soup.find("a", id="resolution")["href"]
 | 
			
		||||
        self._original_image_url = soup.find("a", class_="original")["href"]
 | 
			
		||||
        self._image_url = urlunsplit(("https", self.domain,
 | 
			
		||||
                                     soup.find("a",
 | 
			
		||||
                                               class_="original")["href"].strip("/"),
 | 
			
		||||
                                     "", ""))
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def image_url(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -126,9 +141,9 @@ class Image(PageHandler):
 | 
			
		|||
 | 
			
		||||
        return self._image_url
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def original_image_url(self):
 | 
			
		||||
        if not self._original_image_url:
 | 
			
		||||
            self._get_urls()
 | 
			
		||||
    def download(self, dir_path):
 | 
			
		||||
        filename = os.path.basename(self.image_url)
 | 
			
		||||
 | 
			
		||||
        return self._original_image_url
 | 
			
		||||
        with requests.get(self.image_url, stream=True) as r:
 | 
			
		||||
            with open(os.path.join(dir_path, filename), "wb") as f:
 | 
			
		||||
                shutil.copyfileobj(r.raw, f)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue