Add extension to support metaformats (#213)

snarfed · web-flow · commit 698f2bbd6b4a · 2023-11-30T16:29:36.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
 - add srcset support (#209)
 - add language support (#210)
 - add extension to expose the DOM for embedded properties (#208)
+- add extension to support metaformats (#212)
 
 ## 1.1.3 - 2023-06-28
 - reduce instances where photo is implied (#135)
diff --git a/mf2py/metaformats.py b/mf2py/metaformats.py
@@ -0,0 +1,89 @@
+"""Metaformats parser.
+
+https://microformats.org/wiki/metaformats
+
+TODO:
+* explicit mf2 classes on meta tags
+  https://microformats.org/wiki/metaformats#parsing_an_element_for_properties
+"""
+from .dom_helpers import try_urljoin
+from .mf2_classes import filter_classes
+
+METAFORMAT_TO_MF2 = [
+    # (meta attribute, attribute value, mf2 property), in descending priority
+    # OGP
+    ("property", "article:author", "author"),
+    ("property", "article:published_time", "published"),
+    ("property", "article:modified_time", "updated"),
+    ("property", "og:audio", "audio"),
+    ("property", "og:description", "summary"),
+    ("property", "og:image", "photo"),
+    ("property", "og:title", "name"),
+    ("property", "og:video", "video"),
+    # Twitter
+    ("name", "twitter:title", "name"),
+    ("name", "twitter:description", "summary"),
+    ("name", "twitter:image", "photo"),
+    # HTML standard meta names
+    # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name
+    ("name", "description", "summary"),
+]
+OGP_TYPE_TO_MF2 = {  # og:type (part before the first ".") -> mf2 root type
+    "article": "h-entry",
+    "movie": "h-cite",
+    "music": "h-cite",
+    "profile": "h-card",
+}
+URL_PROPERTIES = {  # meta names whose content parse() passes to try_urljoin
+    "article:author",
+    "og:audio",
+    "og:image",
+    "og:video",
+    "twitter:image",
+}
+
+
+def parse(soup, url=None):
+    """Extracts and returns a metaformats item from a BeautifulSoup parse tree.
+
+    Args:
+      soup (bs4.BeautifulSoup): parsed HTML
+      url (str): URL of document
+
+    Returns:
+      dict: mf2 item, or None if the input is not eligible for metaformats
+    """
+    if not soup.head:
+        return None
+
+    # Is there a microformat2 root class on the html element?
+    if filter_classes(soup.get("class", []))["h"]:
+        return None
+
+    parsed = {"properties": {}}
+    props = parsed["properties"]
+
+    # Properties
+    for attr, meta, mf2 in METAFORMAT_TO_MF2:
+        if val := soup.head.find("meta", attrs={attr: meta}):
+            if content := val.get("content"):
+                if meta in URL_PROPERTIES:
+                    content = try_urljoin(url, content)
+                props.setdefault(mf2, [content])
+
+    if soup.head.title:
+        if text := soup.head.title.text:
+            props.setdefault("name", [text])
+
+    if not props:
+        # No OGP or Twitter properties
+        return None
+
+    # type from OGP or default to h-entry
+    parsed["type"] = ["h-entry"]
+    if ogp_type := soup.head.find("meta", property="og:type"):
+        if content := ogp_type.get("content"):
+            if mf2_type := OGP_TYPE_TO_MF2.get(content.split(".")[0]):
+                parsed["type"] = [mf2_type]
+
+    return parsed
diff --git a/mf2py/parser.py b/mf2py/parser.py
@@ -6,13 +6,20 @@
 from bs4 import BeautifulSoup, FeatureNotFound
 from bs4.element import Tag
 
-from . import backcompat, implied_properties, mf2_classes, parse_property, temp_fixes
+from . import (
+    backcompat,
+    implied_properties,
+    metaformats,
+    mf2_classes,
+    parse_property,
+    temp_fixes,
+)
 from .dom_helpers import get_attr, get_children, get_descendents, try_urljoin
 from .mf_helpers import unordered_list
 from .version import __version__
 
 
-def parse(doc=None, url=None, html_parser=None, expose_dom=False):
+def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=False):
     """
     Parse a microformats2 document or url and return a json dictionary.
 
@@ -26,10 +33,18 @@ def parse(doc=None, url=None, html_parser=None, expose_dom=False):
         options from the BeautifulSoup documentation are:
         "html", "xml", "html5", "lxml", "html5lib", and "html.parser"
       expose_dom (boolean): optional, expose the DOM of embedded properties.
+      metaformats (boolean): whether to include metaformats extracted from OGP
+        and Twitter card data: https://microformats.org/wiki/metaformats
 
     Return: a json dict represented the structured data in this document.
     """
-    return Parser(doc, url, html_parser, expose_dom).to_dict()
+    return Parser(
+        doc,
+        url,
+        html_parser,
+        expose_dom=expose_dom,
+        metaformats=metaformats,
+    ).to_dict()
 
 
 class Parser(object):
@@ -47,6 +62,8 @@ class Parser(object):
         "html", "xml", "html5", "lxml", "html5lib", and "html.parser"
         defaults to "html5lib"
       expose_dom (boolean): optional, expose the DOM of embedded properties.
+      metaformats (boolean): whether to include metaformats extracted from OGP
+        and Twitter card data: https://microformats.org/wiki/metaformats
 
     Attributes:
       useragent (string): the User-Agent string for the Parser
@@ -56,7 +73,14 @@ class Parser(object):
     ua_url = "https://github.com/microformats/mf2py"
     useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)
 
-    def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
+    def __init__(
+        self,
+        doc=None,
+        url=None,
+        html_parser=None,
+        expose_dom=False,
+        metaformats=False,
+    ):
         self.__url__ = None
         self.__doc__ = None
         self._preserve_doc = False
@@ -70,6 +94,7 @@ def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
                 "version": __version__,
             },
         }
+        self.__metaformats = metaformats
         self.expose_dom = expose_dom
         self.lang = None
 
@@ -487,9 +512,16 @@ def parse_el(el, ctx):
                         parse_el(child, ctx)
 
         ctx = []
+
+        if self.__metaformats:
+            # extract out a metaformats item, if available
+            self.__metaformats_item = metaformats.parse(self.__doc__, url=self.__url__)
+
         # start parsing at root element of the document
         parse_el(self.__doc__, ctx)
         self.__parsed__["items"] = ctx
+        if self.__metaformats and self.__metaformats_item:
+            self.__parsed__["items"].append(self.__metaformats_item)
 
         # parse for rel values
         for el in get_descendents(self.__doc__):
diff --git a/test/examples/metaformats_html_meta.html b/test/examples/metaformats_html_meta.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Hello World</title>
+  <base href="http://tantek.com/" />
+  <meta name="description" content="Descrypshun bar" />
+</head>
+<body>
+  <p>Hello world!</p>
+</body>
+</html>
diff --git a/test/examples/metaformats_ogp.html b/test/examples/metaformats_ogp.html
@@ -0,0 +1,20 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Hello World</title>
+  <base href="http://tantek.com/" />
+  <meta property="og:type" content="article" />
+  <meta property="og:title" content="Titull foo" />
+  <meta property="og:description" content="Descrypshun bar" />
+  <meta property="og:image" content="http://example.com/baz.jpg" />
+  <meta property="og:audio" content="http://example.com/biff.mp3" />
+  <meta property="og:video" content="http://example.com/boff.mov" />
+  <meta property="article:author" content="/me" />
+  <meta property="article:published_time" content="2023-01-02T03:04Z" />
+  <meta property="article:modified_time" content="2023-01-02T05:06Z" />
+</head>
+<body>
+  <p>Hello world!</p>
+</body>
+</html>
diff --git a/test/examples/metaformats_twitter.html b/test/examples/metaformats_twitter.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Hello World</title>
+  <base href="http://tantek.com/" />
+  <meta name="twitter:title" content="Titull foo" />
+  <meta name="twitter:description" content="Descrypshun bar" />
+  <meta name="twitter:image" content="/baz.jpg" />
+</head>
+<body>
+  <p>Hello world!</p>
+</body>
+</html>
diff --git a/test/test_parser.py b/test/test_parser.py
@@ -1126,6 +1126,69 @@ def test_all_u_cases():
         )
 
 
+def test_metaformats_flag_false():
+    result = parse_fixture("metaformats_ogp.html")  # metaformats flag not passed
+    assert result["items"] == []  # OGP metas in the fixture must be ignored
+
+
+def test_metaformats_title_only():
+    result = parse_fixture("base.html", metaformats=True)
+    assert result["items"] == [  # presumably base.html only has a <title>; verify
+        {
+            "type": ["h-entry"],  # default type when no og:type maps to one
+            "properties": {
+                "name": ["Hello World"],
+            },
+        }
+    ]
+
+
+def test_metaformats_ogp():
+    result = parse_fixture("metaformats_ogp.html", metaformats=True)
+    assert result["items"] == [  # one item built from the OGP meta tags
+        {
+            "type": ["h-entry"],  # og:type "article" maps to h-entry
+            "properties": {
+                "name": ["Titull foo"],  # og:title outranks the <title> text
+                "summary": ["Descrypshun bar"],
+                "photo": ["http://example.com/baz.jpg"],
+                "audio": ["http://example.com/biff.mp3"],
+                "video": ["http://example.com/boff.mov"],
+                "author": ["http://tantek.com/me"],  # "/me" made absolute
+                "published": ["2023-01-02T03:04Z"],
+                "updated": ["2023-01-02T05:06Z"],
+            },
+        }
+    ]
+
+
+def test_metaformats_twitter():
+    result = parse_fixture("metaformats_twitter.html", metaformats=True)
+    assert result["items"] == [  # one item built from the Twitter card metas
+        {
+            "type": ["h-entry"],  # no og:type in the fixture, so the default
+            "properties": {
+                "name": ["Titull foo"],  # twitter:title outranks <title>
+                "summary": ["Descrypshun bar"],
+                "photo": ["http://tantek.com/baz.jpg"],  # "/baz.jpg" made absolute
+            },
+        }
+    ]
+
+
+def test_metaformats_html_meta():
+    result = parse_fixture("metaformats_html_meta.html", metaformats=True)
+    assert result["items"] == [  # only standard HTML head data is available
+        {
+            "type": ["h-entry"],  # no og:type in the fixture, so the default
+            "properties": {
+                "name": ["Hello World"],  # falls back to the <title> text
+                "summary": ["Descrypshun bar"],  # from <meta name="description">
+            },
+        }
+    ]
+
+
 def test_language():
     result = parse_fixture("language.html")
     assert result["items"][0]["lang"] == "it"