Edm0nd/edmond/plugins/taxref.py

import json
import random
import urllib.parse
from typing import cast, Optional

import requests

from edmond.plugin import Plugin
from edmond.plugins.shrlok import ShrlokPlugin

# Root URL of the TAXREF API; endpoint paths are appended to it.
BASE_URL = "https://taxref.mnhn.fr/api"
# HTML page template meant to be filled in with str.format(): the single "{}"
# placeholder receives the comma-separated elements of a JavaScript array of
# image URLs, while the doubled braces "{{" / "}}" are format escapes that
# render as literal braces in the CSS and JavaScript. Once loaded, the page
# fetches every URL as a blob and appends it to the body as an <img> element.
IMG_FETCH_HTML = """\
<!doctype html>
<html>
  <head>
    <meta charset="UTF-8"/>
    <style>img {{ display: block; max-width: 95%; }}</style>
  </head>
  <body></body>
  <script>
    const urls = [{}];
    urls.forEach(url => {{
      fetch(url)
        .then(r => r.blob())
        .then(blob => {{
          let img = document.createElement("img");
          img.src = window.URL.createObjectURL(blob);
          document.body.appendChild(img);
        }});
    }});
  </script>
</html>
"""


class TaxrefPlugin(Plugin):
    """Look up taxa on the TAXREF API and report them on the channel.

    Two commands are read from the ``commands`` config list: the first one
    searches a taxon by scientific name, the second one finds the scientific
    name matching a French vernacular name.
    """

    REQUIRED_CONFIGS = [
        "commands",
        "not_found_reply",
        "reply",
        "ambiguous_reply",
        "unnamed_species",
    ]

    # Seconds to wait for TAXREF before giving up: requests has no default
    # timeout, so an unresponsive server would otherwise block the handler.
    REQUEST_TIMEOUT = 10
    # Maximum number of names listed in an ambiguous reply.
    MAX_AMBIGUOUS_NAMES = 5
    # Maximum number of images embedded in the page shared through shrlok.
    MAX_IMAGES = 10

    def __init__(self, bot):
        super().__init__(bot)

    def on_pubmsg(self, event):
        """Dispatch a public message to the matching command handler.

        Return False when the message is not a command for this plugin, True
        otherwise.
        """
        if not self.should_handle_command(event.arguments[0]):
            return False

        commands = self.config["commands"]
        # "taxref"
        if self.command.ident == commands[0]:
            self.search_by_name(self.command.content, event.target)
        # "scientifize"
        elif self.command.ident == commands[1]:
            self.find_scientific_name(self.command.content, event.target)
        return True

    @staticmethod
    def _embedded(data, key: str) -> list:
        """Return the list at data["_embedded"][key], or [] when missing."""
        if not isinstance(data, dict):
            return []
        return (data.get("_embedded") or {}).get(key) or []

    @staticmethod
    def _image_url(media) -> Optional[str]:
        """Return the file URL of a media entry, or None if it has none."""
        links = media.get("_links") or {}
        return (links.get("file") or {}).get("href")

    def _search_taxa(
        self, param: str, name: str, target: str
    ) -> Optional[list]:
        """Search taxa whose `param` query field matches `name`.

        Return the list of matching taxa (possibly empty). On a network
        error, a non-200 status or an undecodable body, signal the failure to
        `target` and return None so the caller can simply stop.
        """
        enc_name = urllib.parse.quote(name)
        url = f"{BASE_URL}/taxa/search?{param}={enc_name}&page=1&size=100"
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            self.bot.log_d(f"Request to TAXREF failed: {exc}")
            self.signal_failure(target)
            return None
        if response.status_code != 200:
            self.signal_failure(target)
            return None
        try:
            data = response.json()
        except ValueError:
            self.signal_failure(target)
            return None
        return self._embedded(data, "taxa")

    def search_by_name(self, name: str, target: str) -> None:
        """Get species data from a scientific name.

        Try to disambiguate the results by focusing on species only and their
        scientific name. When no single taxon can be selected, an ambiguous
        reply listing the candidates is sent instead.
        """
        name = name.lower()
        items = self._search_taxa("scientificNames", name, target)
        if items is None:
            # The failure has already been signalled.
            return

        if not items:
            self.bot.say(target, self.config["not_found_reply"])
            return

        if len(items) == 1:
            # Only one result: use it.
            item_to_use = items[0]
        else:
            # More than one result: if the results contain a corresponding
            # species, use it, else return names for sub-species etc.
            species_items = [
                item for item in items if item.get("rankId") == "ES"
            ]
            num_species = len(species_items)
            self.bot.log_d(f"{num_species} species.")
            if num_species == 1:
                item_to_use = species_items[0]
            else:
                # If there are several species, check if one of them has the
                # exact same name; else show an ambiguous reply.
                species_with_same_name = [
                    item
                    for item in species_items
                    if (item.get("scientificName") or "").lower() == name
                ]
                if len(species_with_same_name) != 1:
                    # Without any species among the results, list every
                    # result rather than replying with an empty list.
                    candidates = species_items or items
                    reply = self.get_ambiguous_reply(candidates)
                    self.bot.say(target, reply)
                    return
                item_to_use = species_with_same_name[0]

        unnamed = self.config["unnamed_species"]
        reply = self.config["reply"].format(
            sci_name=item_to_use.get("scientificName"),
            fr_name=item_to_use.get("frenchVernacularName") or unnamed,
            family=item_to_use.get("familyName"),
            cd_nom=item_to_use.get("id"),
            cd_ref=item_to_use.get("referenceId"),
        )
        self.bot.say(target, reply)

        if images_reply := self.get_images_reply(item_to_use):
            self.bot.say(target, images_reply)

    def get_ambiguous_reply(self, items) -> str:
        """Show a reply with potential species.

        At most MAX_AMBIGUOUS_NAMES names are listed; when some are left out,
        the reply ends with the number of names that are not shown.
        """
        shown = items[: self.MAX_AMBIGUOUS_NAMES]
        reply = self.config["ambiguous_reply"]
        reply += ", ".join(item["scientificName"] for item in shown)
        num_hidden = len(items) - len(shown)
        if num_hidden > 0:
            reply += f"… (+{num_hidden})"
        return reply

    def get_images_reply(self, item) -> Optional[str]:
        """If there are media available, return one in a message.

        If shrlok is available, return a link to an HTML page shared by shrlok.
        The HTML page, whose source code is generated from the template
        IMG_FETCH_HTML, fetches a random sample of 1 to 10 images from the
        results and embed the images directly into the page so it is not
        necessary to download the images before seeing them.

        If shrlok is not available, return a string with an URL to an image if
        one is available, or None if no image could be found or we encountered
        an error. The image is selected randomly. Yes, media links on TAXREF
        are downloaded by the browser and not shown directly, thus the benefits
        of having shrlok available.
        """
        links = item.get("_links") or {}
        m_url = (links.get("media") or {}).get("href")
        if not m_url:
            self.bot.log_d("No media links.")
            return None
        try:
            response = requests.get(m_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            self.bot.log_d(f"Failed to reach media link ({exc}).")
            return None
        if (code := response.status_code) != 200:
            self.bot.log_d(f"Failed to reach media link ({code}).")
            return None
        try:
            media_data = response.json()
        except ValueError:
            self.bot.log_d("Media response is not valid JSON.")
            return None
        items = self._embedded(media_data, "media")
        if not items:
            self.bot.log_d("No media found in response.")
            return None

        # Ignore media entries without a file link so they can neither be
        # picked at random nor end up as a bogus URL in the page.
        urls = [url for url in map(self._image_url, items) if url]
        if not urls:
            self.bot.log_d("No link found.")
            return None

        if shrlok := cast(ShrlokPlugin, self.bot.get_plugin("shrlok")):
            if len(urls) > self.MAX_IMAGES:
                urls = random.sample(urls, self.MAX_IMAGES)
            # The URLs come from an external service: serialise them as JSON
            # strings (valid JavaScript literals) and escape "</" so a value
            # cannot close the <script> element early.
            urls_text = ",".join(
                json.dumps(url).replace("</", "<\\/") for url in urls
            )
            html = IMG_FETCH_HTML.format(urls_text).encode()
            link = shrlok.post({"type": "raw", "ext": "html"}, html)
            if not link:
                self.bot.log_d("shrlok plugin returned an empty string.")
        else:
            link = random.choice(urls)

        if link:
            return "📷 " + link
        return None

    def find_scientific_name(self, name: str, target: str) -> None:
        """Find a corresponding scientific name for a vernacular name.

        A single match is sent directly. Several matches are shared as a text
        file through shrlok when that plugin is available, or summarised in
        an ambiguous reply otherwise.
        """
        name = name.lower()
        items = self._search_taxa("frenchVernacularNames", name, target)
        if items is None:
            # The failure has already been signalled.
            return

        if not items:
            self.bot.say(target, self.config["not_found_reply"])
            return

        if len(items) == 1:
            # Only one result: use it.
            reply = TaxrefPlugin.item_to_full_name(items[0])
        elif shrlok := cast(ShrlokPlugin, self.bot.get_plugin("shrlok")):
            # More than one result? For simplicity sake, use the shrlok plugin
            # if available or just show an ambiguous response.
            unnamed = self.config["unnamed_species"]
            lines = [
                (item.get("frenchVernacularName") or unnamed)
                + " → "
                + TaxrefPlugin.item_to_full_name(item)
                for item in items
            ]
            text = "\n".join(lines) + "\n"
            reply = shrlok.post({"type": "txt"}, text.encode())
            if not reply:
                self.bot.log_d("shrlok plugin returned an empty string.")
                return
        else:
            reply = self.get_ambiguous_reply(items)

        self.bot.say(target, reply)

    @staticmethod
    def item_to_full_name(item: dict) -> str:
        """Return "<family name> <scientific name>", skipping missing parts."""
        parts = (item.get("familyName"), item.get("scientificName"))
        return " ".join(part for part in parts if part)