# didier/didier/data/scrapers/google.py

import http
import typing
from dataclasses import dataclass, field
from typing import Optional
from urllib.parse import unquote_plus, urlencode

from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag

# Public API of this module: the search coroutine and the container it returns.
__all__ = ["google_search", "SearchData"]


@dataclass
class SearchData:
    """Container for the outcome of a single search request.

    The ``query`` passed to the constructor is URL-decoded (``+`` becomes a
    space, ``%xx`` escapes are resolved) right after initialisation.
    """

    # The query string; stored in decoded form, see __post_init__
    query: str
    # HTTP status code of the response
    status_code: int
    # Formatted result strings; every instance gets its own empty list by default
    results: list[str] = field(default_factory=list)
    # Text of the result statistics, empty when there are none
    result_stats: str = ""

    def __post_init__(self) -> None:
        """Replace the encoded query with its URL-decoded form"""
        decoded_query = unquote_plus(self.query)
        self.query = decoded_query


def get_result_stats(bs: BeautifulSoup) -> Optional[str]:
    """Extract the text of the result statistics from a parsed page

    Looks up the ``<div id="result-stats">`` element and returns its text with
    one trailing non-breaking space (``\\xa0``) removed, or ``None`` when the
    page has no such element.

    Example result: "About 16.570.000 results (0,84 seconds)"
    """
    stats_div = bs.find("div", id="result-stats")
    return None if stats_div is None else stats_div.text.removesuffix("\xa0")


def parse_result(element: Tag) -> Optional[str]:
    """Turn one result wrapper into a Markdown-style link

    The first ``<a>`` carrying an ``href`` supplies the URL, and the ``<h3>``
    inside that anchor supplies the title. Returns ``"[title](url)"``, or
    ``None`` when the wrapper has no such anchor, the anchor has no ``<h3>``,
    or the URL does not start with ``http://`` or ``https://``.
    """
    anchor = element.find("a", href=True)
    if anchor is None:
        return None

    url = anchor["href"]  # type: ignore
    # The lookup can come back empty, so the cast target is Optional
    heading = typing.cast(Optional[Tag], anchor.find("h3"))

    # Both a URL and a title are needed to build the link
    if url is None or heading is None:
        return None

    # Only keep absolute http(s) links
    if not str(url).startswith(("http://", "https://")):
        return None

    text = unquote_plus(heading.text)
    return f"[{text}]({url})"


def get_search_results(bs: BeautifulSoup) -> list[str]:
    """Collect the formatted links of all result wrappers on a parsed page

    Every ``<div class="g">`` is passed to ``parse_result``; wrappers that
    yield ``None`` are skipped. Duplicate links are dropped while keeping the
    order in which they first appeared.
    """
    # A dict is used instead of a set because it keeps insertion order
    unique_links: dict[str, None] = {}

    for wrapper in bs.find_all("div", class_="g"):
        link = parse_result(wrapper)
        if link is not None:
            unique_links[link] = None

    return list(unique_links)


async def google_search(http_session: ClientSession, query: str):
    """Fetch a Google results page and return at most 10 parsed results

    Returns a ``SearchData`` instance. When the response status is not 200 OK
    it only carries the query and that status code; otherwise it also holds
    the first 10 links and the result statistics ("" when none were found).

    NOTE(review): the value handed to ``SearchData`` is the urlencoded
    ``q=...`` pair rather than the caller's raw text - confirm this is intended.
    """
    query = urlencode({"q": query})

    # Ask for 20 results so that 10 remain after dropping duplicates, bad matches, ...
    url = f"https://www.google.com/search?{query}&num=20&hl=en"

    async with http_session.get(url) as response:
        if response.status == http.HTTPStatus.OK:
            soup = BeautifulSoup(await response.text(), "html.parser")
            stats = get_result_stats(soup)
            links = get_search_results(soup)

            return SearchData(query, 200, links[:10], stats or "")

        # Something went wrong: report the status code without any results
        return SearchData(query, response.status)