6
6
from bs4 import BeautifulSoup , FeatureNotFound
7
7
from bs4 .element import Tag
8
8
9
- from . import backcompat , implied_properties , mf2_classes , parse_property , temp_fixes
9
+ from . import (
10
+ backcompat ,
11
+ implied_properties ,
12
+ metaformats ,
13
+ mf2_classes ,
14
+ parse_property ,
15
+ temp_fixes ,
16
+ )
10
17
from .dom_helpers import get_attr , get_children , get_descendents , try_urljoin
11
18
from .mf_helpers import unordered_list
12
19
from .version import __version__
13
20
14
21
15
- def parse (doc = None , url = None , html_parser = None , expose_dom = False ):
22
+ def parse (doc = None , url = None , html_parser = None , expose_dom = False , metaformats = False ):
16
23
"""
17
24
Parse a microformats2 document or url and return a json dictionary.
18
25
@@ -26,10 +33,18 @@ def parse(doc=None, url=None, html_parser=None, expose_dom=False):
26
33
options from the BeautifulSoup documentation are:
27
34
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
28
35
expose_dom (boolean): optional, expose the DOM of embedded properties.
36
+ metaformats (boolean): whether to include metaformats extracted from OGP
37
+ and Twitter card data: https://microformats.org/wiki/metaformats
29
38
30
39
Return: a json dict representing the structured data in this document.
31
40
"""
32
- return Parser (doc , url , html_parser , expose_dom ).to_dict ()
41
+ return Parser (
42
+ doc ,
43
+ url ,
44
+ html_parser ,
45
+ expose_dom = expose_dom ,
46
+ metaformats = metaformats ,
47
+ ).to_dict ()
33
48
34
49
35
50
class Parser (object ):
@@ -47,6 +62,8 @@ class Parser(object):
47
62
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
48
63
defaults to "html5lib"
49
64
expose_dom (boolean): optional, expose the DOM of embedded properties.
65
+ metaformats (boolean): whether to include metaformats extracted from OGP
66
+ and Twitter card data: https://microformats.org/wiki/metaformats
50
67
51
68
Attributes:
52
69
useragent (string): the User-Agent string for the Parser
@@ -56,7 +73,14 @@ class Parser(object):
56
73
ua_url = "https://github.com/microformats/mf2py"
57
74
useragent = "{0} - version {1} - {2}" .format (ua_desc , __version__ , ua_url )
58
75
59
- def __init__ (self , doc = None , url = None , html_parser = None , expose_dom = False ):
76
+ def __init__ (
77
+ self ,
78
+ doc = None ,
79
+ url = None ,
80
+ html_parser = None ,
81
+ expose_dom = False ,
82
+ metaformats = False ,
83
+ ):
60
84
self .__url__ = None
61
85
self .__doc__ = None
62
86
self ._preserve_doc = False
@@ -70,6 +94,7 @@ def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
70
94
"version" : __version__ ,
71
95
},
72
96
}
97
+ self .__metaformats = metaformats
73
98
self .expose_dom = expose_dom
74
99
self .lang = None
75
100
@@ -487,9 +512,16 @@ def parse_el(el, ctx):
487
512
parse_el (child , ctx )
488
513
489
514
ctx = []
515
+
516
+ if self .__metaformats :
517
+ # extract out a metaformats item, if available
518
+ self .__metaformats_item = metaformats .parse (self .__doc__ , url = self .__url__ )
519
+
490
520
# start parsing at root element of the document
491
521
parse_el (self .__doc__ , ctx )
492
522
self .__parsed__ ["items" ] = ctx
523
+ if self .__metaformats and self .__metaformats_item :
524
+ self .__parsed__ ["items" ].append (self .__metaformats_item )
493
525
494
526
# parse for rel values
495
527
for el in get_descendents (self .__doc__ ):
0 commit comments