From 49aaa76aff6379fe97cc7e12341849947b5d19fb Mon Sep 17 00:00:00 2001
From: Stijn De Clercq
Date: Sun, 8 Aug 2021 23:24:16 +0200
Subject: [PATCH] restructure scrapers, don't run jpl task on dev

---
 cogs/google.py                                |  2 +-
 cogs/tasks.py                                 |  6 ++-
 functions/football.py                         |  2 +-
 functions/scrapers/__init__.py                |  0
 functions/scrapers/google.py                  | 38 ++++++++++++++++++
 functions/{scraping.py => scrapers/sporza.py} | 40 +------------------
 6 files changed, 46 insertions(+), 42 deletions(-)
 create mode 100644 functions/scrapers/__init__.py
 create mode 100644 functions/scrapers/google.py
 rename functions/{scraping.py => scrapers/sporza.py} (60%)

diff --git a/cogs/google.py b/cogs/google.py
index 87b1650..446d6d8 100644
--- a/cogs/google.py
+++ b/cogs/google.py
@@ -2,7 +2,7 @@ import discord
 from discord.ext import commands
 from decorators import help
 from enums.help_categories import Category
-from functions.scraping import google_search
+from functions.scrapers.google import google_search
 
 
 class Google(commands.Cog):
diff --git a/cogs/tasks.py b/cogs/tasks.py
index 2c4818a..2dfdbc9 100644
--- a/cogs/tasks.py
+++ b/cogs/tasks.py
@@ -5,7 +5,7 @@ from enums.numbers import Numbers
 from functions import timeFormatters
 from functions.config import config
 from functions.database import currency, poke, prison, birthdays, stats
-from functions.scraping import getMatchweek
+from functions.scrapers.sporza import getMatchweek
 from functions import ufora_notifications
 import json
 import random
@@ -228,6 +228,10 @@ class Tasks(commands.Cog):
         """
         Task that checks the current JPL matchweek & changes the dict value
         """
+        # Don't run this when testing
+        if self.client.user.id != int(constants.didierId):
+            return
+
         matchweek = getMatchweek()
 
         if matchweek is None:
diff --git a/functions/football.py b/functions/football.py
index 64941ad..092a90c 100644
--- a/functions/football.py
+++ b/functions/football.py
@@ -1,7 +1,7 @@
 from enum import Enum
 from attr import dataclass, field
 from functions.timeFormatters import fromString
-from functions.scraping import getJPLMatches, getJPLTable
+from functions.scrapers.sporza import getJPLMatches, getJPLTable
 from functions.stringFormatters import leadingZero
 from datetime import datetime
 import tabulate
diff --git a/functions/scrapers/__init__.py b/functions/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/functions/scrapers/google.py b/functions/scrapers/google.py
new file mode 100644
index 0000000..4b7aefa
--- /dev/null
+++ b/functions/scrapers/google.py
@@ -0,0 +1,38 @@
+from bs4 import BeautifulSoup
+from requests import get
+from urllib.parse import urlencode
+
+
+def google_search(query):
+    """
+    Function to get Google search results
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+    }
+
+    query = urlencode({"q": query})
+
+    # Get 20 results in case some of them are None
+    resp = get("https://www.google.com/search?{}&num=20&hl=en".format(query), headers=headers)
+
+    if resp.status_code != 200:
+        return None, resp.status_code
+
+    bs = BeautifulSoup(resp.text, "html.parser")
+
+    def getContent(element):
+        """
+        Function to find links & titles in the HTML of a <div> element
+        """
+        link = element.find("a", href=True)
+        title = element.find("h3")
+
+        if link is None or title is None:
+            return None
+
+        return link["href"], title.text
+
+    divs = bs.find_all("div", attrs={"class": "g"})
+
+    return list(getContent(d) for d in divs), 200
diff --git a/functions/scraping.py b/functions/scrapers/sporza.py
similarity index 60%
rename from functions/scraping.py
rename to functions/scrapers/sporza.py
index 7b642ec..22c6cf2 100644
--- a/functions/scraping.py
+++ b/functions/scrapers/sporza.py
@@ -1,45 +1,7 @@
+from bs4 import BeautifulSoup
 import re
-
 from requests import get
 from urllib.parse import urlencode
-from bs4 import BeautifulSoup
-
-# TODO add Football requests in here as well
-
-
-def google_search(query):
-    """
-    Function to get Google search results
-    """
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
-    }
-
-    query = urlencode({"q": query})
-
-    # Get 20 results in case some of them are None
-    resp = get("https://www.google.com/search?{}&num=20&hl=en".format(query), headers=headers)
-
-    if resp.status_code != 200:
-        return None, resp.status_code
-
-    bs = BeautifulSoup(resp.text, "html.parser")
-
-    def getContent(element):
-        """
-        Function to find links & titles in the HTML of a <div> element
-        """
-        link = element.find("a", href=True)
-        title = element.find("h3")
-
-        if link is None or title is None:
-            return None
-
-        return link["href"], title.text
-
-    divs = bs.find_all("div", attrs={"class": "g"})
-
-    return list(getContent(d) for d in divs), 200
 
 
 def getMatchweek():