Skip to content

Commit

Permalink
fixed table parsing bug in Mojo::DOM::HTML
Browse files (browse the repository at this point in the history)
  • Loading branch information
kraih committed Sep 17, 2013
1 parent ed95057 commit 0ffffb2
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 31 deletions.
1 change: 1 addition & 0 deletions Changes
@@ -1,5 +1,6 @@

4.40 2013-09-18
- Fixed table parsing bug in Mojo::DOM::HTML.

4.39 2013-09-17
- Improved HTML5.1 compliance of Mojo::DOM::HTML.
Expand Down
48 changes: 21 additions & 27 deletions lib/Mojo/DOM/HTML.pm
Expand Up @@ -49,14 +49,14 @@ my $TOKEN_RE = qr/
)??
/xis;

# Elements that break HTML paragraphs
# HTML elements that break paragraphs
my %PARAGRAPH = map { $_ => 1 } (
qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
);

# HTML table elements
my %TABLE = map { $_ => 1 } qw(col colgroup tbody td th thead tr);
# HTML table elements with optional end tags
my %TABLE = map { $_ => 1 } qw(col colgroup tbody td tfoot th thead tr);

# HTML elements without end tags
my %VOID = map { $_ => 1 } (
Expand Down Expand Up @@ -150,18 +150,12 @@ sub parse {
sub render { $_[0]->_render($_[0]->tree) }

sub _close {
my ($self, $current, $tags, $stop) = @_;
$tags ||= \%TABLE;
$stop ||= 'table';
my ($self, $current, $allowed, $target) = @_;

# Check if parents need to be closed
# Close allowed parent elements until reaching target tag
my $parent = $$current;
while ($parent->[0] ne 'root' && $parent->[1] ne $stop) {

# Close
$tags->{$parent->[1]} and $self->_end($parent->[1], $current);

# Try next
while ($parent->[0] ne 'root' && $parent->[1] ne $target) {
$self->_end($parent->[1], $current) if $allowed->{$parent->[1]};
$parent = $parent->[3];
}
}
Expand Down Expand Up @@ -195,7 +189,7 @@ sub _end {
if ($end eq $$current->[1]) { return $$current = $$current->[3] }

# Table
elsif ($end eq 'table') { $self->_close($current) }
elsif ($end eq 'table') { $self->_close($current, \%TABLE, $end) }

# Missing end tag
$self->_end($$current->[1], $current);
Expand Down Expand Up @@ -271,40 +265,40 @@ sub _start {
# Autoclose optional HTML elements
if (!$self->xml && $$current->[0] ne 'root') {

# "<li>"
# "li"
if ($start eq 'li') { $self->_close($current, {li => 1}, 'ul') }

# "<p>"
# "p"
elsif ($PARAGRAPH{$start}) { $self->_end('p', $current) }

# "<head>"
# "head"
elsif ($start eq 'body') { $self->_end('head', $current) }

# "<optgroup>"
# "optgroup"
elsif ($start eq 'optgroup') { $self->_end('optgroup', $current) }

# "<option>"
# "option"
elsif ($start eq 'option') { $self->_end('option', $current) }

# "<colgroup>", "<thead>", "tbody" and "tfoot"
# "colgroup", "thead", "tbody" and "tfoot"
elsif (grep { $_ eq $start } qw(colgroup thead tbody tfoot)) {
$self->_close($current);
$self->_close($current, \%TABLE, 'table');
}

# "<tr>"
elsif ($start eq 'tr') { $self->_close($current, {tr => 1}) }
# "tr"
elsif ($start eq 'tr') { $self->_close($current, {tr => 1}, 'table') }

# "<th>" and "<td>"
# "th" and "td"
elsif ($start eq 'th' || $start eq 'td') {
$self->_close($current, {$_ => 1}) for qw(th td);
$self->_close($current, {$_ => 1}, 'table') for qw(th td);
}

# "<dt>" and "<dd>"
# "dt" and "dd"
elsif ($start eq 'dt' || $start eq 'dd') {
$self->_end($_, $current) for qw(dt dd);
}

# "<rt>" and "<rp>"
# "rt" and "rp"
elsif ($start eq 'rt' || $start eq 'rp') {
$self->_end($_, $current) for qw(rt rp);
}
Expand Down
12 changes: 8 additions & 4 deletions t/mojo/dom.t
Expand Up @@ -1255,12 +1255,12 @@ $dom = Mojo::DOM->new->parse(<<EOF);
<tr>
<th>A</th>
<th>D
<tbody>
<tr>
<td>B
<tfoot>
<tr>
<td>C
<tbody>
<tr>
<td>B
</table>
EOF
is $dom->at('table > thead > tr > th')->text, 'A', 'right text';
Expand All @@ -1286,6 +1286,9 @@ $dom = Mojo::DOM->new->parse(<<EOF);
<tbody>
<tr>
<td>B
<tbody>
<tr>
<td>E
</table>
EOF
is $dom->find('table > col')->[0]->attr->{id}, 'morefail', 'right attribute';
Expand All @@ -1298,7 +1301,8 @@ is $dom->find('table > colgroup > col')->[2]->attr->{id}, 'bar',
'right attribute';
is $dom->at('table > thead > tr > th')->text, 'A', 'right text';
is $dom->find('table > thead > tr > th')->[1]->text, 'D', 'right text';
is $dom->at('table > tbody > tr > td')->text, 'B', 'right text';
is $dom->at('table > tbody > tr > td')->text, 'B', 'right text';
is $dom->find('table > tbody > tr > td')->text, "B\nE", 'right text';

# Optional "colgroup", "tbody", "tr", "th" and "td" tags
$dom = Mojo::DOM->new->parse(<<EOF);
Expand Down

0 comments on commit 0ffffb2

Please sign in to comment.