# didier/functions/scrapers/sporza.py

from bs4 import BeautifulSoup
from functions import config
import re
from requests import get


def getMatchweek():
    """
    Parses the current JPL matchweek out of Sporza's site

    Downloads the Jupiler Pro League category page, takes the first
    <section class="sc-matchdays"> element and searches its markup for
    "speeldag <number>" (case-insensitive).

    Returns the matchweek number as a string of digits, or None when the
    request does not answer with status 200, no such section exists, or
    no matchday label is found inside it.
    """
    # NOTE(review): no timeout is passed to get(); consider adding one so a
    # stalled connection cannot block the caller indefinitely.
    resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")

    if resp.status_code != 200:
        return None

    bs = BeautifulSoup(resp.text, "html.parser")
    matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})

    if not matchdays:
        return None

    # Only the first matching section is inspected
    header = matchdays[0]

    # Capture the digits with a group instead of splitting the match on a
    # single space: the pattern tolerates zero, several or non-space
    # whitespace characters between the word and the number, and splitting
    # would then raise IndexError or hand back an empty string.
    match = re.search(r"speeldag\s*(\d+)", str(header), flags=re.I)

    # Something went wrong, just ignore
    if match is None:
        return None

    return match[1]


def getJPLMatches(week: int):
    """
    JPL matches for a given matchweek

    Requests matchday `week` from Sporza's soccer API; the value stored
    under the "jpl" config key is used as the phase segment of the URL.

    Returns the "matches" entry of the first "groupedMatches" group, or
    None when the request does not answer with status 200 or the response
    body is not JSON of that shape.
    """
    jpl = config.get("jpl")
    # NOTE(review): no timeout is passed to get(); consider adding one so a
    # stalled connection cannot block the caller indefinitely.
    current_day = get(f"https://api.sporza.be/web/soccer/phases/{jpl}/matchdays/{week}")

    # Something went wrong
    if current_day.status_code != 200:
        return None

    # A 200 response can still carry an invalid body, lack the expected keys
    # or contain an empty group list; report those the same way as a failed
    # request instead of letting the lookup raise.
    try:
        return current_day.json()["groupedMatches"][0]["matches"]
    except (ValueError, KeyError, IndexError, TypeError):
        return None


def getJPLTable():
    """
    JPL table

    Downloads the Jupiler Pro League category page and looks up the element
    whose "summary" attribute is "algemeen klassement".

    Returns every <tr> inside that element except the first one, or None
    when the request does not answer with status 200 or the element is not
    present on the page.
    """
    # NOTE(review): no timeout is passed to get(); consider adding one so a
    # stalled connection cannot block the caller indefinitely.
    page_html = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")

    # Something went wrong
    if page_html.status_code != 200:
        return None

    bs_parsed = BeautifulSoup(page_html.text, "html.parser")
    table = bs_parsed.find(summary="algemeen klassement")

    # find() yields None when nothing matches; bail out instead of raising
    # AttributeError on the chained find_all() call
    if table is None:
        return None

    # Drop the first row (presumably the column headers - confirm against
    # the page markup)
    return table.find_all("tr")[1:]
restructure scrapers, don't run jpl task on dev 2021-08-08 23:24:16 +02:00			`from bs4 import BeautifulSoup`
Fix matches url 2021-08-19 20:47:53 +02:00			`from functions import config`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00			`import re`
Start working on Google Search (#18), add support for Sell All instead of erroring (fixes #29) 2021-01-23 23:40:49 +01:00			`from requests import get`
Fix incorrect sorting in stats ca 2021-01-24 22:00:24 +01:00

			`def getMatchweek():`
			`"""`
			`Parses the current JPL matchweek out of Sporza's site`
			`"""`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00			`resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")`

			`if resp.status_code != 200:`
			`return None`

			`bs = BeautifulSoup(resp.text, "html.parser")`
			`matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})`

Fix sporza matchday scraper 2021-08-08 23:06:59 +02:00			`if len(matchdays) == 0:`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00			`return None`

			`# Table header`
Fix sporza matchday scraper 2021-08-08 23:06:59 +02:00			`header = matchdays[0]`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00
			`# Regex to find current matchday`
			`r = re.compile(r"speeldag\s*\d+", flags=re.I)`

			`match = r.search(str(header))`

			`# Something went wrong, just ignore`
			`if match is None:`
			`return None`

			`# "Speeldag DD" -> split on space & take second`
			`return match[0].split(" ")[1]`
Clean up Football matches a lot 2021-01-25 00:16:38 +01:00

			`def getJPLMatches(week: int):`
			`"""`
			`JPL matches for a given matchweek`
			`"""`
Fix matches url 2021-08-19 20:47:53 +02:00			`jpl = config.get("jpl")`
			`current_day = get(f"https://api.sporza.be/web/soccer/phases/{jpl}/matchdays/{week}")`
Clean up Football matches a lot 2021-01-25 00:16:38 +01:00
			`# Something went wrong`
			`if current_day.status_code != 200:`
			`return None`

			`return current_day.json()["groupedMatches"][0]["matches"]`
Clean up jpl table 2021-01-26 21:58:49 +01:00

			`def getJPLTable():`
			`"""`
			`JPL table`
			`"""`
			`page_html = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")`

Add comments 2021-01-26 22:00:20 +01:00			`# Something went wrong`
Clean up jpl table 2021-01-26 21:58:49 +01:00			`if page_html.status_code != 200:`
			`return None`

			`bs_parsed = BeautifulSoup(page_html.text, "html.parser")`
			`rows = bs_parsed.find(summary="algemeen klassement").find_all("tr")[1:]`
			`return rows`