didier/functions/scraping.py

92 lines
2.2 KiB
Python

import re
from requests import get
from urllib.parse import urlencode
from bs4 import BeautifulSoup
# TODO add Football requests in here as well
def google_search(query):
    """
    Run a Google search for the given query and scrape the results

    Returns a tuple (results, status_code):
    - when the response status is not 200: (None, <that status code>)
    - otherwise: (list, 200), with one list entry per <div class="g">
      found in the page. An entry is a (href, title text) tuple, or None
      when the div lacks a link, an <h3>, or a <span> inside that <h3>.
    """
    def _link_and_title(block):
        """
        Pull the (href, title text) pair out of a single result <div>
        """
        anchor = block.find("a", href=True)
        heading = block.find("h3")

        if anchor is None or heading is None:
            return None

        span = heading.find("span")
        return None if span is None else (anchor["href"], span.text)

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    encoded = urlencode({"q": query})

    # Ask for 20 results, as some of the scraped entries may turn out None
    url = "https://www.google.com/search?{}&num=20&hl=en".format(encoded)
    resp = get(url, headers=request_headers)

    if resp.status_code != 200:
        return None, resp.status_code

    soup = BeautifulSoup(resp.text, "html.parser")
    result_divs = soup.find_all("div", attrs={"class": "g"})

    return [_link_and_title(div) for div in result_divs], 200
def getMatchweek():
    """
    Parses the current JPL matchweek out of Sporza's site

    Fetches the Jupiler Pro League category page, takes the second
    <section class="sc-matchdays"> element and searches its HTML for
    "speeldag <number>" (case-insensitive).

    Returns the matchweek number as a string of digits, or None when the
    request does not return status 200, fewer than two such sections are
    present, or no matchday label is found.
    """
    resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/")

    if resp.status_code != 200:
        return None

    bs = BeautifulSoup(resp.text, "html.parser")
    matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})

    if len(matchdays) < 2:
        return None

    # Table header
    header = matchdays[1]

    # Capture the digits with a group instead of splitting on a space:
    # the pattern accepts any run of whitespace (or none at all) between
    # the word and the number, so a split on " " is not reliable
    match = re.search(r"speeldag\s*(\d+)", str(header), flags=re.I)

    # Something went wrong, just ignore
    if match is None:
        return None

    return match[1]
def getJPLMatches(week: int):
    """
    JPL matches for a given matchweek

    Requests the Sporza matchday endpoint for `week` and returns the
    "matches" entry of the first element of "groupedMatches" in the
    JSON response.

    Returns None when the response status is not 200, when the body is
    not valid JSON, or when the payload does not contain that entry.
    """
    # NOTE(review): 161733 is presumably a competition/season id - confirm
    current_day = get("https://api.sporza.be/web/soccer/matchdays/161733/{}".format(week))

    # Something went wrong
    if current_day.status_code != 200:
        return None

    # A body that is not JSON, or that lacks (or has an empty)
    # "groupedMatches", is reported the same way as a failed request
    # instead of raising
    try:
        return current_day.json()["groupedMatches"][0]["matches"]
    except (KeyError, IndexError, TypeError, ValueError):
        return None