Skip to content

Commit 698f2bb

Browse files
authored
Add extension to support metaformats (#213)
1 parent 043c2f6 commit 698f2bb

File tree

7 files changed

+235
-4
lines changed

7 files changed

+235
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
77
- add srcset support (#209)
88
- add language support (#210)
99
- add extension to expose the DOM for embedded properties (#208)
10+
- add extension to support metaformats (#212)
1011

1112
## 1.1.3 - 2023-06-28
1213
- reduce instances where photo is implied (#135)

mf2py/metaformats.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Metaformats parser.
2+
3+
https://microformats.org/wiki/metaformats
4+
5+
TODO:
6+
* explicit mf2 classes on meta tags
7+
https://microformats.org/wiki/metaformats#parsing_an_element_for_properties
8+
"""
9+
from .dom_helpers import try_urljoin
10+
from .mf2_classes import filter_classes
11+
12+
METAFORMAT_TO_MF2 = [
13+
# in priority order, descending
14+
# OGP
15+
("property", "article:author", "author"),
16+
("property", "article:published_time", "published"),
17+
("property", "article:modified_time", "updated"),
18+
("property", "og:audio", "audio"),
19+
("property", "og:description", "summary"),
20+
("property", "og:image", "photo"),
21+
("property", "og:title", "name"),
22+
("property", "og:video", "video"),
23+
# Twitter
24+
("name", "twitter:title", "name"),
25+
("name", "twitter:description", "summary"),
26+
("name", "twitter:image", "photo"),
27+
# HTML standard meta names
28+
# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name
29+
("name", "description", "summary"),
30+
]
31+
OGP_TYPE_TO_MF2 = {
32+
"article": "h-entry",
33+
"movie": "h-cite",
34+
"music": "h-cite",
35+
"profile": "h-card",
36+
}
37+
URL_PROPERTIES = {
38+
"article:author",
39+
"og:audio",
40+
"og:image",
41+
"og:video",
42+
"twitter:image",
43+
}
44+
45+
46+
def parse(soup, url=None):
    """Extract and return a metaformats item from a BeautifulSoup parse tree.

    Properties are taken from the ``<meta>`` tags listed in
    ``METAFORMAT_TO_MF2`` (first match per mf2 property wins, in table
    order), with ``<title>`` as a fallback for ``name``.

    Args:
      soup (bs4.BeautifulSoup): parsed HTML
      url (str): URL of document; passed to ``try_urljoin`` for properties
        listed in ``URL_PROPERTIES``

    Returns:
      dict: mf2 item, or None if the input is not eligible for metaformats
    """
    if not soup.head:
        return None

    # Is there a microformats2 root class on the html element?
    # The BeautifulSoup document object itself carries no attributes, so the
    # classes must be read from its <html> element. Fall back to ``soup``
    # when it has no <html> descendant (e.g. the caller passed the <html>
    # tag itself).
    root = soup.html if soup.html is not None else soup
    if filter_classes(root.get("class", []))["h"]:
        return None

    parsed = {"properties": {}}
    props = parsed["properties"]

    # Properties: METAFORMAT_TO_MF2 is in descending priority order, so
    # setdefault keeps the first (highest-priority) value found.
    for attr, meta, mf2 in METAFORMAT_TO_MF2:
        if val := soup.head.find("meta", attrs={attr: meta}):
            if content := val.get("content"):
                if meta in URL_PROPERTIES:
                    content = try_urljoin(url, content)
                props.setdefault(mf2, [content])

    # <title> only supplies the name when no meta tag already did.
    if soup.head.title:
        if text := soup.head.title.text:
            props.setdefault("name", [text])

    if not props:
        # No recognized meta properties and no usable <title>
        return None

    # type from OGP or default to h-entry
    parsed["type"] = ["h-entry"]
    if ogp_type := soup.head.find("meta", property="og:type"):
        if content := ogp_type.get("content"):
            # Only the part before the first "." selects the mf2 type,
            # e.g. "music.song" -> "music".
            if mf2_type := OGP_TYPE_TO_MF2.get(content.split(".")[0]):
                parsed["type"] = [mf2_type]

    return parsed

mf2py/parser.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,20 @@
66
from bs4 import BeautifulSoup, FeatureNotFound
77
from bs4.element import Tag
88

9-
from . import backcompat, implied_properties, mf2_classes, parse_property, temp_fixes
9+
from . import (
10+
backcompat,
11+
implied_properties,
12+
metaformats,
13+
mf2_classes,
14+
parse_property,
15+
temp_fixes,
16+
)
1017
from .dom_helpers import get_attr, get_children, get_descendents, try_urljoin
1118
from .mf_helpers import unordered_list
1219
from .version import __version__
1320

1421

15-
def parse(doc=None, url=None, html_parser=None, expose_dom=False):
22+
def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=False):
1623
"""
1724
Parse a microformats2 document or url and return a json dictionary.
1825
@@ -26,10 +33,18 @@ def parse(doc=None, url=None, html_parser=None, expose_dom=False):
2633
options from the BeautifulSoup documentation are:
2734
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
2835
expose_dom (boolean): optional, expose the DOM of embedded properties.
36+
metaformats (boolean): whether to include metaformats extracted from OGP
37+
and Twitter card data: https://microformats.org/wiki/metaformats
2938
3039
Return: a json dict representing the structured data in this document.
3140
"""
32-
return Parser(doc, url, html_parser, expose_dom).to_dict()
41+
return Parser(
42+
doc,
43+
url,
44+
html_parser,
45+
expose_dom=expose_dom,
46+
metaformats=metaformats,
47+
).to_dict()
3348

3449

3550
class Parser(object):
@@ -47,6 +62,8 @@ class Parser(object):
4762
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
4863
defaults to "html5lib"
4964
expose_dom (boolean): optional, expose the DOM of embedded properties.
65+
metaformats (boolean): whether to include metaformats extracted from OGP
66+
and Twitter card data: https://microformats.org/wiki/metaformats
5067
5168
Attributes:
5269
useragent (string): the User-Agent string for the Parser
@@ -56,7 +73,14 @@ class Parser(object):
5673
ua_url = "https://github.com/microformats/mf2py"
5774
useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)
5875

59-
def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
76+
def __init__(
77+
self,
78+
doc=None,
79+
url=None,
80+
html_parser=None,
81+
expose_dom=False,
82+
metaformats=False,
83+
):
6084
self.__url__ = None
6185
self.__doc__ = None
6286
self._preserve_doc = False
@@ -70,6 +94,7 @@ def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
7094
"version": __version__,
7195
},
7296
}
97+
self.__metaformats = metaformats
7398
self.expose_dom = expose_dom
7499
self.lang = None
75100

@@ -487,9 +512,16 @@ def parse_el(el, ctx):
487512
parse_el(child, ctx)
488513

489514
ctx = []
515+
516+
if self.__metaformats:
517+
# extract out a metaformats item, if available
518+
self.__metaformats_item = metaformats.parse(self.__doc__, url=self.__url__)
519+
490520
# start parsing at root element of the document
491521
parse_el(self.__doc__, ctx)
492522
self.__parsed__["items"] = ctx
523+
if self.__metaformats and self.__metaformats_item:
524+
self.__parsed__["items"].append(self.__metaformats_item)
493525

494526
# parse for rel values
495527
for el in get_descendents(self.__doc__):
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta http-equiv="content-type" content="text/html; charset=utf-8">
5+
<title>Hello World</title>
6+
<base href="http://tantek.com/" />
7+
<meta name="description" content="Descrypshun bar" />
8+
</head>
9+
<body>
10+
<p>Hello world!</p>
11+
</body>
12+
</html>

test/examples/metaformats_ogp.html

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta http-equiv="content-type" content="text/html; charset=utf-8">
5+
<title>Hello World</title>
6+
<base href="http://tantek.com/" />
7+
<meta property="og:type" content="article" />
8+
<meta property="og:title" content="Titull foo" />
9+
<meta property="og:description" content="Descrypshun bar" />
10+
<meta property="og:image" content="http://example.com/baz.jpg" />
11+
<meta property="og:audio" content="http://example.com/biff.mp3" />
12+
<meta property="og:video" content="http://example.com/boff.mov" />
13+
<meta property="article:author" content="/me" />
14+
<meta property="article:published_time" content="2023-01-02T03:04Z" />
15+
<meta property="article:modified_time" content="2023-01-02T05:06Z" />
16+
</head>
17+
<body>
18+
<p>Hello world!</p>
19+
</body>
20+
</html>
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta http-equiv="content-type" content="text/html; charset=utf-8">
5+
<title>Hello World</title>
6+
<base href="http://tantek.com/" />
7+
<meta name="twitter:title" content="Titull foo" />
8+
<meta name="twitter:description" content="Descrypshun bar" />
9+
<meta name="twitter:image" content="/baz.jpg" />
10+
</head>
11+
<body>
12+
<p>Hello world!</p>
13+
</body>
14+
</html>

test/test_parser.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,69 @@ def test_all_u_cases():
11261126
)
11271127

11281128

1129+
def test_metaformats_flag_false():
1130+
result = parse_fixture("metaformats_ogp.html")
1131+
assert result["items"] == []
1132+
1133+
1134+
def test_metaformats_title_only():
1135+
result = parse_fixture("base.html", metaformats=True)
1136+
assert result["items"] == [
1137+
{
1138+
"type": ["h-entry"],
1139+
"properties": {
1140+
"name": ["Hello World"],
1141+
},
1142+
}
1143+
]
1144+
1145+
1146+
def test_metaformats_ogp():
1147+
result = parse_fixture("metaformats_ogp.html", metaformats=True)
1148+
assert result["items"] == [
1149+
{
1150+
"type": ["h-entry"],
1151+
"properties": {
1152+
"name": ["Titull foo"],
1153+
"summary": ["Descrypshun bar"],
1154+
"photo": ["http://example.com/baz.jpg"],
1155+
"audio": ["http://example.com/biff.mp3"],
1156+
"video": ["http://example.com/boff.mov"],
1157+
"author": ["http://tantek.com/me"],
1158+
"published": ["2023-01-02T03:04Z"],
1159+
"updated": ["2023-01-02T05:06Z"],
1160+
},
1161+
}
1162+
]
1163+
1164+
1165+
def test_metaformats_twitter():
1166+
result = parse_fixture("metaformats_twitter.html", metaformats=True)
1167+
assert result["items"] == [
1168+
{
1169+
"type": ["h-entry"],
1170+
"properties": {
1171+
"name": ["Titull foo"],
1172+
"summary": ["Descrypshun bar"],
1173+
"photo": ["http://tantek.com/baz.jpg"],
1174+
},
1175+
}
1176+
]
1177+
1178+
1179+
def test_metaformats_html_meta():
1180+
result = parse_fixture("metaformats_html_meta.html", metaformats=True)
1181+
assert result["items"] == [
1182+
{
1183+
"type": ["h-entry"],
1184+
"properties": {
1185+
"name": ["Hello World"],
1186+
"summary": ["Descrypshun bar"],
1187+
},
1188+
}
1189+
]
1190+
1191+
11291192
def test_language():
11301193
result = parse_fixture("language.html")
11311194
assert result["items"][0]["lang"] == "it"

0 commit comments

Comments
 (0)