# didier/functions/scraping.py

import re

from requests import get
from urllib.parse import urlencode
from bs4 import BeautifulSoup

# TODO add Football requests in here as well


def google_search(query):
    """
    Run a Google search and scrape links & titles out of the result page.

    Returns a tuple (results, status_code):
    - when the response status is not 200: (None, <that status code>)
    - otherwise: (list, 200), where the list holds one entry for every
      <div class="g"> on the page. An entry is a (href, title text) tuple,
      or None when the div lacks either an <a href> or an <h3>.
    """
    # Present ourselves as a regular desktop browser
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }

    encoded = urlencode({"q": query})

    # Ask for 20 results, as some of the scraped entries can turn out to be None
    url = "https://www.google.com/search?{}&num=20&hl=en".format(encoded)
    response = get(url, headers=browser_headers)

    status = response.status_code
    if status != 200:
        return None, status

    soup = BeautifulSoup(response.text, "html.parser")

    def link_and_title(div):
        """
        Pull the first link & the first <h3> out of a single <div> element
        """
        anchor = div.find("a", href=True)
        heading = div.find("h3")

        if anchor is not None and heading is not None:
            return anchor["href"], heading.text

        return None

    result_divs = soup.find_all("div", attrs={"class": "g"})

    return [link_and_title(div) for div in result_divs], 200


def getMatchweek():
    """
    Parses the current JPL matchweek out of Sporza's site

    Fetches the Jupiler Pro League category page, takes the first
    <section class="sc-matchdays"> element and searches its HTML for
    "speeldag <number>" (case-insensitive).

    Returns the matchweek number as a string of digits, or None when the
    response status is not 200, no such section exists, or the pattern
    is not found.
    """
    resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")

    if resp.status_code != 200:
        return None

    bs = BeautifulSoup(resp.text, "html.parser")
    matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})

    if not matchdays:
        return None

    # Table header
    header = matchdays[0]

    # Capture the digits with a group instead of splitting the match on a
    # single space: the pattern allows zero, multiple or non-space whitespace
    # characters between the word and the number, and splitting on " " would
    # then raise an IndexError or yield an empty string.
    match = re.search(r"speeldag\s*(\d+)", str(header), flags=re.I)

    # Something went wrong, just ignore
    if match is None:
        return None

    return match.group(1)


def getJPLMatches(week: int):
    """
    Fetch the JPL matches of a given matchweek from Sporza's API

    Returns the "matches" entry of the first item in the response's
    "groupedMatches", or None when the response status is not 200.
    """
    url = "https://api.sporza.be/web/soccer/matchdays/161733/{}".format(week)
    response = get(url)

    if response.status_code == 200:
        payload = response.json()
        return payload["groupedMatches"][0]["matches"]

    # Any other status code counts as a failure
    return None


def getJPLTable():
    """
    JPL table

    Fetches Sporza's Jupiler Pro League category page and looks up the
    element whose "summary" attribute is "algemeen klassement".

    Returns the <tr> elements of that element without the first one, or
    None when the response status is not 200 or the element is not on
    the page.
    """
    page_html = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")

    # Something went wrong
    if page_html.status_code != 200:
        return None

    bs_parsed = BeautifulSoup(page_html.text, "html.parser")
    table = bs_parsed.find(summary="algemeen klassement")

    # find() gives None when nothing matches (e.g. the page layout changed);
    # report that as a failure instead of crashing with an AttributeError
    if table is None:
        return None

    # Drop the first row (presumably the table's header row)
    return table.find_all("tr")[1:]
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00			`import re`

Start working on Google Search (#18), add support for Sell All instead of erroring (fixes #29) 2021-01-23 23:40:49 +01:00			`from requests import get`
			`from urllib.parse import urlencode`
			`from bs4 import BeautifulSoup`

			`# TODO add Football requests in here as well`


			`def google_search(query):`
			`"""`
			`Function to get Google search results`
			`"""`
			`headers = {`
			`'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'`
			`}`

			`query = urlencode({"q": query})`

Change discord version to 1.6.0, Google search (fixes #18) 2021-01-24 13:08:58 +01:00			`# Get 20 results in case some of them are None`
			`resp = get("https://www.google.com/search?{}&num=20&hl=en".format(query), headers=headers)`

Start working on Google Search (#18), add support for Sell All instead of erroring (fixes #29) 2021-01-23 23:40:49 +01:00			`if resp.status_code != 200:`
			`return None, resp.status_code`

			`bs = BeautifulSoup(resp.text, "html.parser")`

			`def getContent(element):`
Add comments 2021-01-24 13:14:44 +01:00			`"""`
			`Function to find links & titles in the HTML of a <div> element`
			`"""`
Change discord version to 1.6.0, Google search (fixes #18) 2021-01-24 13:08:58 +01:00			`link = element.find("a", href=True)`
			`title = element.find("h3")`
Add comments 2021-01-24 13:14:44 +01:00
Change discord version to 1.6.0, Google search (fixes #18) 2021-01-24 13:08:58 +01:00			`if link is None or title is None:`
			`return None`

Fix google search 2021-04-21 09:44:26 +02:00			`return link["href"], title.text`
Add comments 2021-01-24 13:14:44 +01:00
Change discord version to 1.6.0, Google search (fixes #18) 2021-01-24 13:08:58 +01:00			`divs = bs.find_all("div", attrs={"class": "g"})`
Start working on Google Search (#18), add support for Sell All instead of erroring (fixes #29) 2021-01-23 23:40:49 +01:00
			`return list(getContent(d) for d in divs), 200`
Fix incorrect sorting in stats ca 2021-01-24 22:00:24 +01:00

			`def getMatchweek():`
			`"""`
			`Parses the current JPL matchweek out of Sporza's site`
			`"""`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00			`resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")`

			`if resp.status_code != 200:`
			`return None`

			`bs = BeautifulSoup(resp.text, "html.parser")`
			`matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})`

Fix sporza matchday scraper 2021-08-08 23:06:59 +02:00			`if len(matchdays) == 0:`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00			`return None`

			`# Table header`
Fix sporza matchday scraper 2021-08-08 23:06:59 +02:00			`header = matchdays[0]`
Scrape current jpl matchweek every few hours 2021-01-24 22:31:09 +01:00
			`# Regex to find current matchday`
			`r = re.compile(r"speeldag\s*\d+", flags=re.I)`

			`match = r.search(str(header))`

			`# Something went wrong, just ignore`
			`if match is None:`
			`return None`

			`# "Speeldag DD" -> split on space & take second`
			`return match[0].split(" ")[1]`
Clean up Football matches a lot 2021-01-25 00:16:38 +01:00

			`def getJPLMatches(week: int):`
			`"""`
			`JPL matches for a given matchweek`
			`"""`
			`current_day = get("https://api.sporza.be/web/soccer/matchdays/161733/{}".format(week))`

			`# Something went wrong`
			`if current_day.status_code != 200:`
			`return None`

			`return current_day.json()["groupedMatches"][0]["matches"]`
Clean up jpl table 2021-01-26 21:58:49 +01:00

			`def getJPLTable():`
			`"""`
			`JPL table`
			`"""`
			`page_html = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")`

Add comments 2021-01-26 22:00:20 +01:00			`# Something went wrong`
Clean up jpl table 2021-01-26 21:58:49 +01:00			`if page_html.status_code != 200:`
			`return None`

			`bs_parsed = BeautifulSoup(page_html.text, "html.parser")`
			`rows = bs_parsed.find(summary="algemeen klassement").find_all("tr")[1:]`
			`return rows`