From 0c810d84e9906489c5baf8a7f79f4145ef0147d4 Mon Sep 17 00:00:00 2001
From: stijndcl
Date: Mon, 18 Jul 2022 23:22:28 +0200
Subject: [PATCH] Google search

---
 didier/cogs/other.py                       | 17 +++-
 didier/data/embeds/google/__init__.py      |  3 +
 didier/data/embeds/google/google_search.py | 51 ++++++++++++
 didier/data/scrapers/__init__.py           |  0
 didier/data/scrapers/google.py             | 95 ++++++++++++++++++++++
 requirements.txt                           |  1 +
 6 files changed, 165 insertions(+), 2 deletions(-)
 create mode 100644 didier/data/embeds/google/__init__.py
 create mode 100644 didier/data/embeds/google/google_search.py
 create mode 100644 didier/data/scrapers/__init__.py
 create mode 100644 didier/data/scrapers/google.py

diff --git a/didier/cogs/other.py b/didier/cogs/other.py
index 6a036e2..04175ed 100644
--- a/didier/cogs/other.py
+++ b/didier/cogs/other.py
@@ -1,7 +1,10 @@
+from discord import app_commands
 from discord.ext import commands
 
 from didier import Didier
 from didier.data.apis import urban_dictionary
+from didier.data.embeds.google import GoogleSearch
+from didier.data.scrapers import google
 
 
 class Other(commands.Cog):
@@ -15,8 +18,18 @@ class Other(commands.Cog):
     @commands.hybrid_command(name="define", description="Urban Dictionary", aliases=["Ud", "Urban"], usage="[Woord]")
     async def define(self, ctx: commands.Context, *, query: str):
         """Look up the definition of a word on the Urban Dictionary"""
-        definitions = await urban_dictionary.lookup(self.client.http_session, query)
-        await ctx.reply(embed=definitions[0].to_embed(), mention_author=False)
+        async with ctx.typing():
+            definitions = await urban_dictionary.lookup(self.client.http_session, query)
+            await ctx.reply(embed=definitions[0].to_embed(), mention_author=False)
+
+    @commands.hybrid_command(name="google", description="Google search", usage="[Query]")
+    @app_commands.describe(query="Search query")
+    async def google(self, ctx: commands.Context, *, query: str):
+        """Google something"""
+        async with ctx.typing():
+            results = await google.google_search(self.client.http_session, query)
+            embed = GoogleSearch(results).to_embed()
+            await ctx.reply(embed=embed, mention_author=False)
 
 
 async def setup(client: Didier):
diff --git a/didier/data/embeds/google/__init__.py b/didier/data/embeds/google/__init__.py
new file mode 100644
index 0000000..bc57985
--- /dev/null
+++ b/didier/data/embeds/google/__init__.py
@@ -0,0 +1,3 @@
+from .google_search import GoogleSearch
+
+__all__ = ["GoogleSearch"]
diff --git a/didier/data/embeds/google/google_search.py b/didier/data/embeds/google/google_search.py
new file mode 100644
index 0000000..605a5fa
--- /dev/null
+++ b/didier/data/embeds/google/google_search.py
@@ -0,0 +1,51 @@
+from http import HTTPStatus
+
+import discord
+
+from didier.data.embeds.base import EmbedBaseModel
+from didier.data.scrapers.google import SearchData
+
+__all__ = ["GoogleSearch"]
+
+
+class GoogleSearch(EmbedBaseModel):
+    """Embed to display Google search results"""
+
+    data: SearchData
+
+    def __init__(self, data: SearchData):
+        self.data = data
+
+    def _error_embed(self) -> discord.Embed:
+        """Custom embed for unsuccessful requests
+
+        Covers both failed requests and successful requests without results
+        """
+        embed = discord.Embed(colour=discord.Colour.red())
+        embed.set_author(name="Google Search")
+
+        # Error embed
+        # Checked first: a failed request has no results either, so checking
+        # for empty results first would hide the status code
+        if self.data.status_code != HTTPStatus.OK:
+            embed.description = f"Status {self.data.status_code}"
+            return embed
+
+        # Empty embed
+        embed.description = "Geen resultaten gevonden"
+        return embed
+
+    def to_embed(self) -> discord.Embed:
+        """Create the embed, falling back to the error embed if there is nothing to show"""
+        if not self.data.results or self.data.status_code != HTTPStatus.OK:
+            return self._error_embed()
+
+        embed = discord.Embed(colour=discord.Colour.blue())
+        embed.set_author(name="Google Search")
+        embed.set_footer(text=self.data.result_stats or None)
+
+        # Add all results into the description
+        results = [f"{index}: {url}" for index, url in enumerate(self.data.results, start=1)]
+
+        embed.description = "\n".join(results)
+        return embed
diff --git a/didier/data/scrapers/__init__.py b/didier/data/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/didier/data/scrapers/google.py b/didier/data/scrapers/google.py
new file mode 100644
index 0000000..d9a34c2
--- /dev/null
+++ b/didier/data/scrapers/google.py
@@ -0,0 +1,95 @@
+import http
+from dataclasses import dataclass, field
+from typing import Optional
+from urllib.parse import unquote_plus, urlencode
+
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+
+__all__ = ["google_search", "SearchData"]
+
+
+@dataclass
+class SearchData:
+    """Dataclass to store some data about a search query"""
+
+    query: str
+    status_code: int
+    results: list[str] = field(default_factory=list)
+    result_stats: str = ""
+
+    def __post_init__(self):
+        self.query = unquote_plus(self.query)
+
+
+def get_result_stats(bs: BeautifulSoup) -> Optional[str]:
+    """Parse the result stats
+
+    Example result: "About 16.570.000 results (0,84 seconds)"
+
+    Returns None if the page doesn't contain a result-stats element
+    """
+    stats_element = bs.find("div", id="result-stats")
+    if stats_element is None:
+        return None
+
+    stats = stats_element.text
+    return stats and stats.removesuffix("\xa0")
+
+
+def parse_result(element: Tag) -> Optional[str]:
+    """Parse 1 wrapper into a link
+
+    Returns a Markdown link, or None if the wrapper has no usable link or title
+    """
+    a_tag = element.find("a", href=True)
+    # find() returns None when the wrapper doesn't contain a link
+    if a_tag is None:
+        return None
+
+    url = a_tag["href"]
+    title = a_tag.find("h3")
+
+    # Only keep absolute web links that have a title to display
+    if not url.startswith(("http://", "https://")) or title is None:
+        return None
+
+    text = unquote_plus(title.text)
+    return f"[{text}]({url})"
+
+
+def get_search_results(bs: BeautifulSoup) -> list[str]:
+    """Parse the search results"""
+    result_wrappers = bs.find_all("div", class_="g")
+    results = (result for result in map(parse_result, result_wrappers) if result is not None)
+
+    # Remove duplicates
+    # (sets don't preserve the order!)
+    return list(dict.fromkeys(results))
+
+
+async def google_search(http_client: ClientSession, query: str) -> SearchData:
+    """Get the first 10 Google search results
+
+    On a non-OK response the returned SearchData only contains the status code
+    """
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
+    }
+
+    # NOTE: SearchData decodes this again, so the stored query keeps its "q=" prefix
+    query = urlencode({"q": query})
+
+    # Request 20 results in case of duplicates, bad matches, ...
+    async with http_client.get(f"https://www.google.com/search?{query}&num=20&hl=en", headers=headers) as response:
+        # Something went wrong
+        if response.status != http.HTTPStatus.OK:
+            return SearchData(query, response.status)
+
+        bs = BeautifulSoup(await response.text(), "html.parser")
+        result_stats = get_result_stats(bs)
+        results = get_search_results(bs)
+
+        # result_stats is None when the stats couldn't be found, but SearchData expects a string
+        return SearchData(query, response.status, results[:10], result_stats or "")
diff --git a/requirements.txt b/requirements.txt
index 0737e1f..285b936 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 aiohttp==3.8.1
 alembic==1.8.0
 asyncpg==0.25.0
+beautifulsoup4==4.11.1
 # Dev version of dpy
 git+https://github.com/Rapptz/discord.py
 environs==9.5.0