Skip to content

Commit 1edc082

Browse files
authored
Extract XML::Document from XML::Node (#15920)
Introduces the `XML::Document` type to wrap a `LibXML::Doc*`, while `XML::Node` is meant to wrap the subtree `LibXML::Node*`. The `XML::Document` inherits from `XML::Node` and acts exactly like a `XML::Node`, so it should hopefully be a transparent change (no breaking changes). The main advantage is that we now register a single finalizer instead of one for every node (only the document needed a finalizer). Sadly, it doesn't allow having a dual strong reference from a document to its nodes, and from the nodes to their document. Boehm GC still complains about cyclic finalization, despite only the document having a finalizer.
1 parent e4d6a1d commit 1edc082

File tree

5 files changed

+131
-107
lines changed

5 files changed

+131
-107
lines changed

src/xml.cr

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ module XML
5454
# Parses an XML document from *string* with *options* into an `XML::Node`.
5555
#
5656
# See `ParserOptions.default` for default options.
57-
def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Node
57+
def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Document
5858
raise XML::Error.new("Document is empty", 0) if string.empty?
5959
ctxt = LibXML.xmlNewParserCtxt
6060
from_ptr(ctxt) do
@@ -65,7 +65,7 @@ module XML
6565
# Parses an XML document from *io* with *options* into an `XML::Node`.
6666
#
6767
# See `ParserOptions.default` for default options.
68-
def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Node
68+
def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Document
6969
ctxt = LibXML.xmlNewParserCtxt
7070
from_ptr(ctxt) do
7171
LibXML.xmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, nil, options)
@@ -75,7 +75,7 @@ module XML
7575
# Parses an HTML document from *string* with *options* into an `XML::Node`.
7676
#
7777
# See `HTMLParserOptions.default` for default options.
78-
def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Node
78+
def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Document
7979
raise XML::Error.new("Document is empty", 0) if string.empty?
8080
ctxt = LibXML.htmlNewParserCtxt
8181
from_ptr(ctxt) do
@@ -86,7 +86,7 @@ module XML
8686
# Parses an HTML document from *io* with *options* into an `XML::Node`.
8787
#
8888
# See `HTMLParserOptions.default` for default options.
89-
def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Node
89+
def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Document
9090
ctxt = LibXML.htmlNewParserCtxt
9191
from_ptr(ctxt) do
9292
LibXML.htmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, "utf-8", options)
@@ -119,7 +119,7 @@ module XML
119119
{% end %}
120120
raise Error.new(LibXML.xmlGetLastError) unless doc
121121

122-
Node.new(doc, errors)
122+
Document.new(doc, errors)
123123
end
124124

125125
{% unless LibXML.has_method?(:xmlSaveSetIndentString) %}

src/xml/document.cr

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
require "weak_ref"
2+
3+
class XML::Document < XML::Node
4+
# :nodoc:
5+
#
6+
# The constructors allocate a XML::Node for a libxml node once, so we don't
7+
# finalize a document twice for example.
8+
#
9+
# We store the reference into the libxml struct (_private) for documents
10+
# because a document's XML::Node lives as long as its libxml doc. However we
11+
# can lose references to subtree XML::Node, so using _private would leave
12+
# dangling pointers. We thus keep a cache of weak references to all nodes in
13+
# the document, so we can still collect lost references, and at worst
14+
# reinstantiate a XML::Node if needed.
15+
#
16+
# NOTE: when a XML::Node is moved to another document, the XML::Node and any
17+
# instantiated descendant XML::Node shall be cleaned from the original
18+
# document's cache, and must be added to the new document's cache.
19+
protected getter cache : Hash(LibXML::Node*, WeakRef(Node))
20+
21+
# :nodoc:
22+
#
23+
# Unlinked libxml nodes, and all their descendant nodes, don't appear in the
24+
# document's tree anymore, and must be manually freed, yet we can't merely
25+
# free the libxml node in a finalizer, because it would free the whole
26+
# subtree, while we may still have live XML::Node instances.
27+
#
28+
# We keep an explicit list of unlinked libxml nodes. We can't rely on the
29+
# cache because it uses weak references and the XML::Node could be collected,
30+
# leaking the libxml node and its subtree.
31+
#
32+
# NOTE: the libxml node, along with any descendant shall be removed from the
33+
# list when relinked into a tree, be it the same document or another.
34+
protected getter unlinked_nodes : Set(LibXML::Node*)
35+
36+
# :nodoc:
37+
def self.new(doc : LibXML::Doc*, errors : Array(Error)? = nil) : Document
38+
if ptr = doc.value._private
39+
ptr.as(Document)
40+
else
41+
new(doc_: doc, errors_: errors)
42+
end
43+
end
44+
45+
# Must never be called directly, use the constructors above.
46+
private def initialize(*, doc_ : LibXML::Doc*, errors_ : Array(Error)?)
47+
@node = doc_.as(LibXML::Node*)
48+
@errors = errors_
49+
@cache = Hash(LibXML::Node*, WeakRef(Node)).new
50+
@unlinked_nodes = Set(LibXML::Node*).new
51+
@document = self
52+
doc_.value._private = self.as(Void*)
53+
end
54+
55+
# :nodoc:
56+
def finalize
57+
# free unlinked nodes and their subtrees
58+
@unlinked_nodes.each do |node|
59+
if node.value.doc == @node
60+
LibXML.xmlFreeNode(node)
61+
else
62+
# the node has been adopted into another document, don't free!
63+
end
64+
end
65+
66+
# free the doc and its subtree
67+
LibXML.xmlFreeDoc(@node.as(LibXML::Doc*))
68+
end
69+
70+
# Returns the encoding of this node's document.
71+
def encoding : String?
72+
if encoding = @node.as(LibXML::Doc*).value.encoding
73+
String.new(encoding)
74+
end
75+
end
76+
77+
# Returns the version of this node's document.
78+
def version : String?
79+
if version = @node.as(LibXML::Doc*).value.version
80+
String.new(version)
81+
end
82+
end
83+
84+
# :nodoc:
85+
def document : Document
86+
self
87+
end
88+
89+
# Returns the list of `XML::Error` found when parsing this document.
90+
# Returns `nil` if no errors were found.
91+
def errors : Array(XML::Error)?
92+
@errors unless @errors.try &.empty?
93+
end
94+
end

src/xml/namespace.cr

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
class XML::Namespace
2-
getter document : Node
2+
getter document : Document
33

44
# :nodoc:
5-
def initialize(@document : Node, @ns : LibXML::NS*)
5+
@[Deprecated]
6+
def self.new(document : Node, ns : LibXML::NS*)
7+
new(document.as(Document), ns)
8+
end
9+
10+
# :nodoc:
11+
def initialize(@document : Document, @ns : LibXML::NS*)
612
end
713

814
# See `Object#hash(hasher)`

0 commit comments

Comments
 (0)