mirror of
https://github.com/stijndcl/didier.git
synced 2026-04-07 15:48:29 +02:00
restructure scrapers, don't run jpl task on dev
This commit is contained in:
parent
e07a2c28d1
commit
49aaa76aff
6 changed files with 46 additions and 42 deletions
0
functions/scrapers/__init__.py
Normal file
0
functions/scrapers/__init__.py
Normal file
38
functions/scrapers/google.py
Normal file
38
functions/scrapers/google.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from bs4 import BeautifulSoup
|
||||
from requests import get
|
||||
from urllib.parse import urlencode
|
||||
|
||||
|
||||
def google_search(query):
    """
    Scrape Google search results for *query*.

    :param query: raw (unencoded) search string
    :return: a tuple ``(results, status_code)``:
        on success, a list of ``(url, title)`` tuples and ``200`` — entries
        may be ``None`` when a result <div> lacks a link or title, which is
        why 20 results are requested instead of 10;
        on a non-200 response, ``(None, status_code)``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }

    query = urlencode({"q": query})

    # Get 20 results in case some of them are None.
    # timeout: requests has no default timeout, so without one a hung
    # connection would block the caller forever.
    resp = get("https://www.google.com/search?{}&num=20&hl=en".format(query),
               headers=headers, timeout=10)

    if resp.status_code != 200:
        return None, resp.status_code

    bs = BeautifulSoup(resp.text, "html.parser")

    def getContent(element):
        """
        Find the link & title in the HTML of a result <div>.

        Returns ``(href, title_text)`` or ``None`` when either is missing.
        """
        link = element.find("a", href=True)
        title = element.find("h3")

        if link is None or title is None:
            return None

        return link["href"], title.text

    divs = bs.find_all("div", attrs={"class": "g"})

    # List comprehension instead of list(generator expression) (C400)
    return [getContent(d) for d in divs], 200
|
||||
63
functions/scrapers/sporza.py
Normal file
63
functions/scrapers/sporza.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
from requests import get
|
||||
from urllib.parse import urlencode
|
||||
|
||||
|
||||
def getMatchweek():
    """
    Parse the current JPL matchweek out of Sporza's site.

    :return: the matchday number as a string, or ``None`` when the request
        fails or the page doesn't contain a recognizable matchday header.
    """
    # timeout so a hung connection can't block the caller forever
    resp = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/", timeout=10)

    if resp.status_code != 200:
        return None

    bs = BeautifulSoup(resp.text, "html.parser")
    matchdays = bs.find_all("section", attrs={"class": "sc-matchdays"})

    if not matchdays:
        return None

    # Table header
    header = matchdays[0]

    # Capture the digits directly instead of splitting "Speeldag DD" on a
    # single space afterwards: \s* also matches no space at all, or a
    # tab/newline, in which case split(" ")[1] would raise IndexError or
    # return the wrong token.
    match = re.search(r"speeldag\s*(\d+)", str(header), flags=re.I)

    # Something went wrong, just ignore
    if match is None:
        return None

    return match.group(1)
|
||||
|
||||
|
||||
def getJPLMatches(week: int, competition_id: int = 161733):
    """
    Fetch the JPL matches for a given matchweek from the Sporza API.

    :param week: matchweek number to fetch
    :param competition_id: Sporza id of the competition/season; defaults to
        the id that was previously hard-coded here (Jupiler Pro League)
    :return: list of match dicts, or ``None`` when the request fails
    """
    # timeout so a hung connection can't block the caller forever
    current_day = get(
        "https://api.sporza.be/web/soccer/matchdays/{}/{}".format(competition_id, week),
        timeout=10,
    )

    # Something went wrong
    if current_day.status_code != 200:
        return None

    return current_day.json()["groupedMatches"][0]["matches"]
|
||||
|
||||
|
||||
def getJPLTable():
    """
    Scrape the JPL standings table from Sporza.

    :return: list of ``<tr>`` row elements (header row excluded), or ``None``
        when the request fails or the standings table isn't on the page.
    """
    # timeout so a hung connection can't block the caller forever
    page_html = get("https://sporza.be/nl/categorie/voetbal/jupiler-pro-league/", timeout=10)

    # Something went wrong
    if page_html.status_code != 200:
        return None

    bs_parsed = BeautifulSoup(page_html.text, "html.parser")

    # Guard against the table missing from the page: find() returns None and
    # the original chained find_all() call would raise AttributeError.
    table = bs_parsed.find(summary="algemeen klassement")
    if table is None:
        return None

    # Skip the header row
    return table.find_all("tr")[1:]
|
||||
Loading…
Add table
Add a link
Reference in a new issue