Skip to content

Commit

Permalink
Extend XML::Reader with more LibXML methods (#5740)
Browse files Browse the repository at this point in the history
* Extend XML::Reader with more LibXML methods

This adds a couple of method bindings that come in handy when doing
pull parsing or hybrid parsing (search with pull then expand node).

* Work around incomplete xmlTextReaderNextSibling()

The current implementation of xmlTextReaderNextSibling() only works on
preparsed documents, so we need to detect the error returned if the
reader is not using a preparsed document and implement our own
next sibling by looking at reader internals.

* Fix XML::Reader#name/#value when not on node

This avoids segfaults when those methods are called before the first or
after the last read.

* Add XML::Type::NONE for XML::Reader#node_type

This fixes a problem where XML::Reader#node_type would return zero
before the first or after the last read, which previously had no
mapping in the XML::Type enum, so the value couldn't be checked.

* Document all XML::Reader methods

* Add specs for all XML::Reader methods

* Avoid Nil for XML::Reader string getters

Instead, return an empty string if the methods are called in an invalid
reader state (before the first or after the last read).

A special case is the #value method, which could also return nil if
called on a node without a text value, like `<tag>`, but here an empty
string also makes sense.

* Use explicit type for XML::Reader attribute methods

* Rename XML::Reader#attribute to #[]/#[]?

and implement behavior similar to XML::Node.
felixbuenemann authored and RX14 committed Jun 29, 2018
1 parent 89b3867 commit 3696bb1
Showing 4 changed files with 551 additions and 2 deletions.
453 changes: 453 additions & 0 deletions spec/std/xml/reader_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,453 @@
require "spec"
require "xml"

# Shared fixture for the specs below: a `people` root holding two `person`
# elements, each with an `id` attribute and a nested `name` element.
# The heredoc text is runtime data; the line breaks between elements are part
# of the document and show up as separate nodes when reading.
private def xml
<<-XML
<?xml version="1.0" encoding="UTF-8"?>
<people>
<person id="1">
<name>John</name>
</person>
<person id="2">
<name>Peter</name>
</person>
</people>
XML
end

module XML
describe Reader do
# Construction: a reader can wrap either a String or an IO. In both cases the
# first `read` is expected to land on the root `people` element.
describe ".new" do
  it "can be initialized from a string" do
    reader = Reader.new(xml)
    reader.should be_a(XML::Reader)
    reader.read.should be_true
    reader.name.should eq("people")
  end

  it "can be initialized from an io" do
    io = IO::Memory.new(xml)
    reader = Reader.new(io)
    reader.should be_a(XML::Reader)
    reader.read.should be_true
    reader.name.should eq("people")
  end
end

# Walks the whole fixture with `read`, asserting node type and name at every
# step until `read` returns false.
# NOTE: as asserted here, the line breaks between elements surface as
# `XML::Type::DTD_NODE` named "#text", and closing tags surface as
# `XML::Type::ELEMENT_DECL` carrying the element's name.
describe "#read" do
it "reads all nodes" do
reader = Reader.new(xml)
# <people>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("people")
# line break after <people>
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# <person id="1">
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("person")
reader["id"].should eq("1")
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# <name>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("name")
# text content of <name>
reader.read.should be_true
reader.node_type.should eq(XML::Type::TEXT_NODE)
reader.name.should eq("#text")
reader.value.should eq("John")
# </name>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("name")
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# </person>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("person")
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# <person id="2">
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("person")
reader["id"].should eq("2")
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# <name>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("name")
# text content of <name>
reader.read.should be_true
reader.node_type.should eq(XML::Type::TEXT_NODE)
reader.name.should eq("#text")
reader.value.should eq("Peter")
# </name>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("name")
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# </person>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("person")
reader.read.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# </people>
reader.read.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("people")
# end of document
reader.read.should be_false
end
end

# Exercises `next`, which advances in document order without descending into
# the subtree of the node it leaves.
describe "#next" do
it "reads next node in doc order, skipping subtrees" do
reader = Reader.new(xml)
# Advance with `read` until the first node at depth 2 (inside the first person).
while reader.read
break if reader.depth == 2
end
reader.next.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("name")
# The text inside <name> is not visited: the next stop is the "#text" node.
reader.next.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# Closing tag of the first person; its attributes are still readable.
reader.next.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("person")
reader["id"].should eq("1")
reader.next.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
reader.next.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("person")
reader["id"].should eq("2")
# The whole second person subtree is skipped.
reader.next.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
reader.next.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("people")
reader.next.should be_false
end
end

# Exercises `next_sibling`: starting from the first depth-1 node, every call
# stays at that depth and returns false once no further sibling exists.
describe "#next_sibling" do
it "reads next sibling node in doc order, skipping subtrees" do
reader = Reader.new(xml)
# Advance with `read` until the first node at depth 1 (a child of the root).
while reader.read
break if reader.depth == 1
end
reader.next_sibling.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("person")
reader["id"].should eq("1")
reader.next_sibling.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
reader.next_sibling.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("person")
reader["id"].should eq("2")
reader.next_sibling.should be_true
reader.node_type.should eq(XML::Type::DTD_NODE)
reader.name.should eq("#text")
# No sibling left under the root.
reader.next_sibling.should be_false
end
end

# `node_type` is `XML::Type::NONE` before the first read and the element type
# once the reader is positioned on the root.
describe "#node_type" do
it "returns the node type" do
reader = Reader.new("<root/>")
reader.node_type.should eq(XML::Type::NONE)
reader.read
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
end
end

# `name` yields an empty string before the first read instead of failing, and
# the element name afterwards.
describe "#name" do
it "reads node name" do
reader = Reader.new("<root/>")
reader.name.should eq("")
reader.read
reader.name.should eq("root")
end
end

# `empty_element?` is true only for the self-closing form; an element written
# with separate open and close tags is not considered empty, even without content.
describe "#empty_element?" do
it "checks if the node is empty" do
reader = Reader.new("<root/>")
# Not positioned on a node yet.
reader.empty_element?.should be_false
reader.read
reader.empty_element?.should be_true
reader = Reader.new("<root></root>")
reader.read
reader.empty_element?.should be_false
end
end

# `has_attributes?` across an element with attributes, a child without any,
# and the closing tag of the parent (which still reports attributes).
describe "#has_attributes?" do
it "checks if the node has attributes" do
reader = Reader.new(%{<root id="1"><child/></root>})
# Not positioned on a node yet.
reader.has_attributes?.should be_false
reader.read # <root id="1">
reader.has_attributes?.should be_true
reader.read # <child/>
reader.has_attributes?.should be_false
reader.read # </root>
reader.has_attributes?.should be_true
end
end

# `attributes_count` across the same positions as the `has_attributes?` spec.
# Unlike `has_attributes?`, the count is zero on the closing tag.
describe "#attributes_count" do
it "returns the node's number of attributes" do
reader = Reader.new(%{<root id="1"><child/></root>})
# Not positioned on a node yet.
reader.attributes_count.should eq(0)
reader.read # <root id="1">
reader.attributes_count.should eq(1)
reader.read # <child/>
reader.attributes_count.should eq(0)
reader.read # </root>
# This is weird, since has_attributes? will be true.
reader.attributes_count.should eq(0)
end
end

# `move_to_first_attribute` repositions the reader on the first attribute of
# the current element; it fails before the first read and on elements
# without attributes, and works on the closing tag as well.
describe "#move_to_first_attribute" do
it "moves to the first attribute of the node" do
reader = Reader.new(%{<root id="1"><child/></root>})
# Not positioned on a node yet.
reader.move_to_first_attribute.should be_false
reader.read # <root id="1">
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.move_to_first_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
reader.value.should eq("1")
reader.read # <child/>
reader.move_to_first_attribute.should be_false
reader.read # </root>
reader.move_to_first_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
reader.value.should eq("1")
reader.read.should be_false
end
end

# `move_to_next_attribute` iterates over the attributes of the current
# element: called on the element itself it lands on the first attribute, and
# it returns false once the last attribute has been passed.
describe "#move_to_next_attribute" do
it "moves to the next attribute of the node" do
reader = Reader.new(%{<root id="1" id2="2"><child/></root>})
# Not positioned on a node yet.
reader.move_to_next_attribute.should be_false
reader.read # <root id="1" id2="2">
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.move_to_next_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
reader.value.should eq("1")
reader.move_to_next_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id2")
reader.value.should eq("2")
# No attribute after id2.
reader.move_to_next_attribute.should be_false
reader.read # <child/>
reader.move_to_next_attribute.should be_false
reader.read # </root>
reader.move_to_next_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
reader.value.should eq("1")
reader.read.should be_false
end
end

# `move_to_attribute` jumps to an attribute by name, in any order, and
# returns false for unknown names or when the current node has no such attribute.
describe "#move_to_attribute" do
it "moves to attribute with the specified name" do
reader = Reader.new(%{<root id="1" id2="2"><child/></root>})
# Not positioned on a node yet.
reader.move_to_attribute("id2").should be_false
reader.read # <root id="1" id2="2">
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.move_to_attribute("id2").should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id2")
reader.value.should eq("2")
# Jumping back to an earlier attribute from an attribute position works too.
reader.move_to_attribute("id").should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
reader.value.should eq("1")
reader.move_to_attribute("bogus").should be_false
reader.read # <child/>
reader.move_to_attribute("id2").should be_false
reader.read # </root>
reader.move_to_attribute("id2").should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id2")
reader.value.should eq("2")
reader.read.should be_false
end
end

# `#[]` returns the attribute value and raises `KeyError` when the attribute
# is missing, including before the first read.
describe "#[]" do
it "reads node attributes" do
reader = Reader.new("<root/>")
# Not positioned on a node yet.
expect_raises(KeyError) { reader["id"] }
reader.read
expect_raises(KeyError) { reader["id"] }
reader = Reader.new(%{<root id="1"/>})
reader.read
reader["id"].should eq("1")
reader = Reader.new(%{<root id="1"><child/></root>})
reader.read # <root id="1">
reader["id"].should eq("1")
reader.read # <child/>
expect_raises(KeyError) { reader["id"] }
reader.read # </root>
reader["id"].should eq("1")
end
end

# `#[]?` mirrors the `#[]` spec but yields nil instead of raising when the
# attribute is missing.
describe "#[]?" do
it "reads node attributes" do
reader = Reader.new("<root/>")
# Not positioned on a node yet.
reader["id"]?.should be_nil
reader.read
reader["id"]?.should be_nil
reader = Reader.new(%{<root id="1"/>})
reader.read
reader["id"]?.should eq("1")
reader = Reader.new(%{<root id="1"><child/></root>})
reader.read # <root id="1">
reader["id"]?.should eq("1")
reader.read # <child/>
reader["id"]?.should be_nil
reader.read # </root>
reader["id"]?.should eq("1")
end
end

# `move_to_element` returns the reader from an attribute to the element that
# owns it. It returns false when the reader is not on an attribute.
describe "#move_to_element" do
it "moves to the element node that contains the current attribute node" do
reader = Reader.new(%{<root id="1"></root>})
# Not positioned on a node yet.
reader.move_to_element.should be_false
reader.read # <root id="1">
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("root")
# Already on the element: nothing to move back from.
reader.move_to_element.should be_false
reader.move_to_first_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
reader.move_to_element.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_NODE)
reader.name.should eq("root")
reader.read # </root>
reader.move_to_element.should be_false
reader.move_to_first_attribute.should be_true
reader.node_type.should eq(XML::Type::ATTRIBUTE_NODE)
reader.name.should eq("id")
# Back on the closing tag, which is reported as ELEMENT_DECL.
reader.move_to_element.should be_true
reader.node_type.should eq(XML::Type::ELEMENT_DECL)
reader.name.should eq("root")
reader.read.should be_false
end
end

# `depth` is 0 before the first read and on the root (open and close tags),
# and 1 on a direct child of the root.
describe "#depth" do
it "returns the depth of the node" do
reader = Reader.new("<root><child/></root>")
# Not positioned on a node yet.
reader.depth.should eq(0)
reader.read # <root>
reader.depth.should eq(0)
reader.read # <child/>
reader.depth.should eq(1)
reader.read # </root>
reader.depth.should eq(0)
end
end

# `read_inner_xml` returns the markup between an element's tags. It is an
# empty string before the first read and on every other node visited here.
describe "#read_inner_xml" do
it "reads the contents of the node including child nodes and markup" do
reader = Reader.new("<root>\n<child/>\n</root>\n")
# Not positioned on a node yet.
reader.read_inner_xml.should eq("")
reader.read # <root>
reader.read_inner_xml.should eq("\n<child/>\n")
reader.read # \n
reader.read_inner_xml.should eq("")
reader.read # <child/>
reader.read_inner_xml.should eq("")
reader.read # \n
reader.read_inner_xml.should eq("")
reader.read # </root>
reader.read_inner_xml.should eq("")
reader.read.should be_false
end
end

# `read_outer_xml` returns the markup of the current node including itself.
# It is an empty string before the first read.
describe "#read_outer_xml" do
it "reads the xml of the node including child nodes and markup" do
reader = Reader.new("<root>\n<child/>\n</root>\n")
# Not positioned on a node yet.
reader.read_outer_xml.should eq("")
reader.read # <root>
# The trailing line break after </root> is not part of the result.
reader.read_outer_xml.should eq("<root>\n<child/>\n</root>")
reader.read # \n
reader.read_outer_xml.should eq("\n")
reader.read # <child/>
reader.read_outer_xml.should eq("<child/>")
reader.read # \n
reader.read_outer_xml.should eq("\n")
reader.read # </root>
# Note that the closing element is transformed into a self-closing one.
reader.read_outer_xml.should eq("<root/>")
reader.read.should be_false
end
end

# `expand` materialises the current node and its subtree as an `XML::Node`
# (nil before the first read) so it can be queried, e.g. with XPath.
describe "#expand" do
it "parses the content of the node and subtree" do
reader = Reader.new(%{<root id="1"><child/></root>})
# Not positioned on a node yet.
reader.expand.should be_nil
reader.read # <root id="1">
node = reader.expand
node.should be_a(XML::Node)
node.not_nil!.attributes["id"].content.should eq("1")
node.not_nil!.xpath_node("child").should be_a(XML::Node)
end

it "is only available until the next read" do
reader = Reader.new(%{<root><child><subchild/></child></root>})
reader.read # <root>
reader.read # <child>
node = reader.expand
node.should be_a(XML::Node)
node.not_nil!.xpath_node("subchild").should be_a(XML::Node)
reader.read # <subchild/>
reader.read # </child>
# NOTE(review): this queries the expanded node after the reader has moved
# on, i.e. after the point where the node is documented as no longer valid;
# the assertion pins the observed result (subtree gone).
node.not_nil!.xpath_node("subchild").should be_nil
end
end

# `value` returns the text of text, comment and attribute nodes, and an
# empty string before the first read or on a node without a text value.
describe "#value" do
it "reads node text value" do
reader = Reader.new(%{<root id="1">hello<!-- world --></root>})
# Not positioned on a node yet.
reader.value.should eq("")
reader.read # <root>
# Elements themselves carry no text value.
reader.value.should eq("")
reader.read # hello
reader.value.should eq("hello")
reader.read # <!-- world -->
reader.value.should eq(" world ")
reader.read # </root>
reader.move_to_first_attribute.should be_true
reader.value.should eq("1")
end
end

# `to_unsafe` exposes the wrapped libxml2 reader handle.
describe "#to_unsafe" do
it "returns a pointer to the underlying LibXML::XMLTextReader" do
reader = Reader.new("<root/>")
reader.to_unsafe.should be_a(LibXML::XMLTextReader)
end
end
end
end
10 changes: 10 additions & 0 deletions src/xml/libxml2.cr
Original file line number Diff line number Diff line change
@@ -95,6 +95,8 @@ lib LibXML
fun xmlNewTextReader(input : InputBuffer, uri : UInt8*) : XMLTextReader

# Cursor movement. `XML::Reader` treats a return value of 1 as success.
fun xmlTextReaderRead(reader : XMLTextReader) : Int
fun xmlTextReaderNext(reader : XMLTextReader) : Int
# `XML::Reader#next_sibling` falls back to its own logic when this returns -1.
fun xmlTextReaderNextSibling(reader : XMLTextReader) : Int
# NOTE(review): the result is mapped onto `XML::Type`; the reader specs show
# closing tags arriving as `ELEMENT_DECL` and inter-element whitespace as
# `DTD_NODE`, so the numbering presumably follows libxml2's reader node types
# rather than its tree node types. Confirm against the libxml2 docs.
fun xmlTextReaderNodeType(reader : XMLTextReader) : XML::Type
# May return a null pointer; `XML::Reader#name` maps that to "".
fun xmlTextReaderConstName(reader : XMLTextReader) : UInt8*
fun xmlTextReaderIsEmptyElement(reader : XMLTextReader) : Int
@@ -103,6 +105,14 @@ lib LibXML
# Attribute access. The `MoveTo*` functions reposition the reader;
# `XML::Reader` treats a return value of 1 as success.
fun xmlTextReaderAttributeCount(reader : XMLTextReader) : Int
fun xmlTextReaderMoveToFirstAttribute(reader : XMLTextReader) : Int
fun xmlTextReaderMoveToNextAttribute(reader : XMLTextReader) : Int
fun xmlTextReaderMoveToAttribute(reader : XMLTextReader, name : UInt8*) : Int
# Null result means "no such attribute" to `XML::Reader#[]?`.
fun xmlTextReaderGetAttribute(reader : XMLTextReader, name : UInt8*) : UInt8*
fun xmlTextReaderMoveToElement(reader : XMLTextReader) : Int
fun xmlTextReaderDepth(reader : XMLTextReader) : Int
# Null results are mapped to "" by `XML::Reader#read_inner_xml` / `#read_outer_xml`.
fun xmlTextReaderReadInnerXml(reader : XMLTextReader) : UInt8*
fun xmlTextReaderReadOuterXml(reader : XMLTextReader) : UInt8*
# Null result is mapped to nil by `XML::Reader#expand`.
fun xmlTextReaderExpand(reader : XMLTextReader) : Node*
# Used by the `XML::Reader#next_sibling` fallback to inspect the current node.
fun xmlTextReaderCurrentNode(reader : XMLTextReader) : Node*

fun xmlTextReaderSetErrorHandler(reader : XMLTextReader, f : TextReaderErrorFunc) : Void

89 changes: 87 additions & 2 deletions src/xml/reader.cr
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
require "./libxml2"

struct XML::Reader
# Creates a new reader from a string.
def initialize(str : String)
input = LibXML.xmlParserInputBufferCreateStatic(str, str.bytesize, 1)
@reader = LibXML.xmlNewTextReader(input, "")
@@ -11,6 +12,7 @@ struct XML::Reader
end
end

# Creates a new reader from an IO.
def initialize(io : IO)
input = LibXML.xmlParserInputBufferCreateIO(
->(context, buffer, length) { Box(IO).unbox(context).read(Slice.new(buffer, length)).to_i },
@@ -21,42 +23,125 @@ struct XML::Reader
@reader = LibXML.xmlNewTextReader(input, "")
end

# Advances the reader to the next node in document order.
#
# Returns `true` when libxml2 reports a successful move (status 1) and
# `false` for any other status, e.g. at the end of the document.
def read
  status = LibXML.xmlTextReaderRead(@reader)
  status == 1
end

# Advances the reader to the next node in document order without descending
# into the subtree of the current node.
#
# Returns `true` when libxml2 reports a successful move (status 1),
# `false` otherwise.
def next
  status = LibXML.xmlTextReaderNext(@reader)
  status == 1
end

# Advances the reader to the next sibling of the current node, skipping the
# current node's subtree. Returns `true` if the reader moved.
#
# libxml2's `xmlTextReaderNextSibling()` is incomplete and reports -1 in that
# case, so a -1 status is handled here by inspecting the current node:
# see https://gitlab.gnome.org/GNOME/libxml2/issues/7
def next_sibling
  status = LibXML.xmlTextReaderNextSibling(@reader)
  # Any status other than -1 is authoritative: 1 means the reader moved.
  return status == 1 unless status == -1

  current = LibXML.xmlTextReaderCurrentNode(@reader)
  if current.null?
    # Not positioned on a node yet: a plain read is the first step.
    LibXML.xmlTextReaderRead(@reader) == 1
  elsif current.value.next.null?
    # The current node has no following sibling.
    false
  else
    # A sibling exists; `next` steps over the current subtree to reach it.
    LibXML.xmlTextReaderNext(@reader) == 1
  end
end

# Returns the `XML::Type` of the current node, as reported by libxml2's
# `xmlTextReaderNodeType`.
#
# The result is `XML::Type::NONE` when the reader is not positioned on a node
# (before the first or after the last read).
#
# NOTE(review): the specs for this reader observe closing tags as
# `XML::Type::ELEMENT_DECL` and whitespace between elements as
# `XML::Type::DTD_NODE`, so the values presumably follow libxml2's reader
# node-type numbering rather than its tree node types. Confirm before relying
# on type names other than element/attribute/text.
def node_type
LibXML.xmlTextReaderNodeType(@reader)
end

# Returns the name of the current node.
#
# libxml2 hands back a null pointer when the reader is not positioned on a
# node (before the first or after the last read); that case yields an empty
# string. The pointer must be checked before building the String, so no
# unguarded `String.new` call may precede the check.
def name
  value = LibXML.xmlTextReaderConstName(@reader)
  value ? String.new(value) : ""
end

# Returns `true` if the current node is an empty element, i.e. libxml2
# reports status 1 for it; `false` for any other status.
def empty_element?
  status = LibXML.xmlTextReaderIsEmptyElement(@reader)
  status == 1
end

# Returns `true` if libxml2 reports that the current node has attributes
# (status 1); `false` for any other status.
def has_attributes?
  status = LibXML.xmlTextReaderHasAttributes(@reader)
  status == 1
end

# Returns the number of attributes of the current node, exactly as reported
# by libxml2's `xmlTextReaderAttributeCount` (the raw value is passed through).
def attributes_count
LibXML.xmlTextReaderAttributeCount(@reader)
end

# Repositions the reader on the first `XML::Type::ATTRIBUTE_NODE` of the
# current node.
#
# Returns `true` if the reader moved (libxml2 status 1), `false` otherwise.
def move_to_first_attribute
  status = LibXML.xmlTextReaderMoveToFirstAttribute(@reader)
  status == 1
end

# Repositions the reader on the next `XML::Type::ATTRIBUTE_NODE` of the
# current node.
#
# Returns `true` if the reader moved (libxml2 status 1), `false` otherwise,
# e.g. when there is no further attribute.
def move_to_next_attribute
  status = LibXML.xmlTextReaderMoveToNextAttribute(@reader)
  status == 1
end

# Repositions the reader on the `XML::Type::ATTRIBUTE_NODE` called *name*.
#
# Returns `true` if the reader moved (libxml2 status 1), `false` otherwise,
# e.g. when the current node has no attribute with that name.
def move_to_attribute(name : String)
  status = LibXML.xmlTextReaderMoveToAttribute(@reader, name)
  status == 1
end

# Returns the content of the attribute named *attribute* on the current node.
#
# Delegates the lookup to `#[]?` and raises `KeyError` when that yields `nil`.
def [](attribute : String) : String
  if content = self[attribute]?
    content
  else
    raise(KeyError.new("Missing attribute: #{attribute}"))
  end
end

# Returns the content of the attribute named *attribute* on the current node,
# or `nil` when libxml2 returns a null pointer for it (attribute not found).
#
# NOTE(review): the buffer returned by `xmlTextReaderGetAttribute` is copied
# into a new String and not released here; presumably it is reclaimed
# elsewhere (e.g. by the allocator libxml2 is configured with) — confirm.
def []?(attribute : String) : String?
  pointer = LibXML.xmlTextReaderGetAttribute(@reader, attribute)
  pointer ? String.new(pointer) : nil
end

# Moves the reader from an `XML::Type::ATTRIBUTE_NODE` back to the element
# node that contains it.
#
# Returns `true` if the reader moved (libxml2 status 1), `false` otherwise,
# e.g. when the reader is not on an attribute.
def move_to_element
  status = LibXML.xmlTextReaderMoveToElement(@reader)
  status == 1
end

# Returns the nesting depth of the current node, exactly as reported by
# libxml2's `xmlTextReaderDepth` (the raw value is passed through).
def depth
LibXML.xmlTextReaderDepth(@reader)
end

# Returns the XML content of the current node, including child nodes and
# markup but not the node's own tags.
#
# Yields an empty string when libxml2 returns a null pointer.
def read_inner_xml
  buffer = LibXML.xmlTextReaderReadInnerXml(@reader)
  return "" unless buffer
  String.new(buffer)
end

# Returns the XML of the current node itself together with its content and
# subtrees.
#
# Yields an empty string when libxml2 returns a null pointer.
def read_outer_xml
  buffer = LibXML.xmlTextReaderReadOuterXml(@reader)
  return "" unless buffer
  String.new(buffer)
end

# Expands the current node and its subtree into an `XML::Node` that can be
# searched with XPath etc. Returns `nil` when libxml2 returns a null pointer.
#
# The returned `XML::Node` is only valid until the next call to `#read`.
def expand
  node_pointer = LibXML.xmlTextReaderExpand(@reader)
  node_pointer ? XML::Node.new(node_pointer) : nil
end

# Returns the text content of the current node.
#
# libxml2 hands back a null pointer when the reader is not positioned on a
# node (before the first or after the last read) or when the node has no text
# value, such as an element like `<tag>`; both cases yield an empty string.
# The pointer must be checked before building the String, so no unguarded
# `String.new` call may precede the check.
def value
  value = LibXML.xmlTextReaderConstValue(@reader)
  value ? String.new(value) : ""
end

# Returns the underlying `LibXML::XMLTextReader` handle held in `@reader`,
# for passing this reader directly to libxml2 bindings.
def to_unsafe
@reader
end
1 change: 1 addition & 0 deletions src/xml/type.cr
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
enum XML::Type
NONE = 0
ELEMENT_NODE = 1
ATTRIBUTE_NODE = 2
TEXT_NODE = 3

0 comments on commit 3696bb1

Please sign in to comment.