mirror of
https://github.com/stijndcl/didier.git
synced 2026-04-07 15:48:29 +02:00
restructure scrapers, don't run jpl task on dev
This commit is contained in:
parent
e07a2c28d1
commit
49aaa76aff
6 changed files with 46 additions and 42 deletions
0
functions/scrapers/__init__.py
Normal file
0
functions/scrapers/__init__.py
Normal file
38
functions/scrapers/google.py
Normal file
38
functions/scrapers/google.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from bs4 import BeautifulSoup
|
||||
from requests import get
|
||||
from urllib.parse import urlencode
|
||||
|
||||
|
||||
def google_search(query):
    """
    Scrape Google search results for *query*.

    :param query: raw (unencoded) search string
    :return: a tuple ``(results, status_code)``:
        on success, a list of ``(url, title)`` tuples and ``200`` — entries
        may be ``None`` when a result <div> lacks a link or title, which is
        why 20 results are requested instead of 10;
        on a non-200 response, ``(None, status_code)``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }

    query = urlencode({"q": query})

    # Get 20 results in case some of them are None.
    # timeout: requests has no default timeout, so without one a hung
    # connection would block the caller forever.
    resp = get("https://www.google.com/search?{}&num=20&hl=en".format(query),
               headers=headers, timeout=10)

    if resp.status_code != 200:
        return None, resp.status_code

    bs = BeautifulSoup(resp.text, "html.parser")

    def getContent(element):
        """
        Find the link & title in the HTML of a result <div>.

        Returns ``(href, title_text)`` or ``None`` when either is missing.
        """
        link = element.find("a", href=True)
        title = element.find("h3")

        if link is None or title is None:
            return None

        return link["href"], title.text

    divs = bs.find_all("div", attrs={"class": "g"})

    # List comprehension instead of list(generator expression) (C400)
    return [getContent(d) for d in divs], 200
|
||||
63
functions/scrapers/sporza.py
Normal file
63
functions/scrapers/sporza.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
from requests import get
|
||||
from urllib.parse import urlencode
|
||||
|
||||
|
||||
def getMatchweek():
    """
    Parse the current JPL matchweek out of Sporza's site.

    :return: the matchday number as a string, or ``None`` when the request
        fails or the page doesn't contain a recognizable matchday header.
    """
    # timeout so a hung connection can't block the caller forever
    resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/", timeout=10)

    if resp.status_code != 200:
        return None

    bs = BeautifulSoup(resp.text, "html.parser")
    matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})

    if not matchdays:
        return None

    # Table header
    header = matchdays[0]

    # Capture the digits directly instead of splitting "Speeldag DD" on a
    # single space afterwards: \s* also matches no space at all, or a
    # tab/newline, in which case split(" ")[1] would raise IndexError or
    # return the wrong token.
    match = re.search(r"speeldag\s*(\d+)", str(header), flags=re.I)

    # Something went wrong, just ignore
    if match is None:
        return None

    return match.group(1)
|
||||
|
||||
|
||||
def getJPLMatches(week: int, competition_id: int = 161733):
    """
    Fetch the JPL matches for a given matchweek from the Sporza API.

    :param week: matchweek number to fetch
    :param competition_id: Sporza id of the competition/season; defaults to
        the id that was previously hard-coded here (Jupiler Pro League)
    :return: list of match dicts, or ``None`` when the request fails
    """
    # timeout so a hung connection can't block the caller forever
    current_day = get(
        "https://api.sporza.be/web/soccer/matchdays/{}/{}".format(competition_id, week),
        timeout=10,
    )

    # Something went wrong
    if current_day.status_code != 200:
        return None

    return current_day.json()["groupedMatches"][0]["matches"]
|
||||
|
||||
|
||||
def getJPLTable():
    """
    Scrape the JPL standings table from Sporza.

    :return: list of ``<tr>`` row elements (header row excluded), or ``None``
        when the request fails or the standings table isn't on the page.
    """
    # timeout so a hung connection can't block the caller forever
    page_html = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/", timeout=10)

    # Something went wrong
    if page_html.status_code != 200:
        return None

    bs_parsed = BeautifulSoup(page_html.text, "html.parser")

    # Guard against the table missing from the page: find() returns None and
    # the original chained find_all() call would raise AttributeError.
    table = bs_parsed.find(summary="algemeen klassement")
    if table is None:
        return None

    # Skip the header row
    return table.find_all("tr")[1:]
|
||||
Loading…
Add table
Add a link
Reference in a new issue