Skip to content

Commit

Permalink
improved Mojo::DOM::HTML to sometimes recover from bad charset information
Browse files Browse the repository at this point in the history
  • Loading branch information
kraih committed Aug 21, 2012
1 parent 3a828f0 commit c6ba823
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 14 deletions.
4 changes: 3 additions & 1 deletion Changes
@@ -1,7 +1,9 @@

3.33 2012-08-21
- Improved resilience of Mojo::DOM::HTML.
- Improved Mojo::DOM::HTML to sometimes recover from bad charset
information.
- Improved documentation.
- Improved tests.

3.32 2012-08-20
- Added event sequentialization support to delay method in Mojo::IOLoop.
Expand Down
11 changes: 8 additions & 3 deletions lib/Mojo/DOM/HTML.pm
Expand Up @@ -81,9 +81,14 @@ my %INLINE = map { $_ => 1 } @HTML4_INLINE, @HTML5_INLINE;
sub parse {
my ($self, $html) = @_;

# Try to decode
my $charset = $self->charset;
$html = decode($charset, $html) // $html if $charset;
# Try to decode (and fall back to no charset)
if (my $charset = $self->charset) {
my $backup = $html;
unless (defined($html = decode $charset, $html)) {
$html = $backup;
$self->charset(undef);
}
}

# Tokenize
my $tree = ['root'];
Expand Down
29 changes: 19 additions & 10 deletions t/mojo/dom.t
Expand Up @@ -2,7 +2,7 @@ use Mojo::Base -strict;

use utf8;

use Test::More tests => 787;
use Test::More tests => 790;

# "Homer gave me a kidney: it wasn't his, I didn't need it,
# and it came postage due- but I appreciated the gesture!"
Expand Down Expand Up @@ -179,7 +179,7 @@ is $dom->at('script')->text, "alert('lalala');", 'right script content';

# HTML5 (unquoted values)
$dom = Mojo::DOM->new->parse(
qq#<div id = test foo ="bar" class=tset>works</div>#);
qq{<div id = test foo ="bar" class=tset>works</div>});
is $dom->at('#test')->text, 'works', 'right text';
is $dom->at('div')->text, 'works', 'right text';
is $dom->at('[foo="bar"]')->text, 'works', 'right text';
Expand All @@ -190,7 +190,7 @@ is $dom->at('.tset')->text, 'works', 'right text';

# HTML1 (single quotes, upper case tags and whitespace in attributes)
$dom = Mojo::DOM->new->parse(
qq#<DIV id = 'test' foo ='bar' class= "tset">works</DIV>#);
qq{<DIV id = 'test' foo ='bar' class= "tset">works</DIV>});
is $dom->at('#test')->text, 'works', 'right text';
is $dom->at('div')->text, 'works', 'right text';
is $dom->at('[foo="bar"]')->text, 'works', 'right text';
Expand All @@ -211,7 +211,7 @@ is $dom->at('[id="snowm\000021 an"]'), undef, 'no result';

# Unicode and escaped selectors
my $chars
= qq#<html><div id="☃x">Snowman</div><div class="x ♥">Heart</div></html>#;
= qq{<html><div id="☃x">Snowman</div><div class="x ♥">Heart</div></html>};
my $bytes = encode 'UTF-8', $chars;
$dom = Mojo::DOM->new->charset('UTF-8');
$dom->parse($bytes);
Expand Down Expand Up @@ -564,25 +564,25 @@ is $dom->at('[foo="bar"]')->attrs('class'), 'x', 'right attribute';

# Markup characters in attribute values
$dom = Mojo::DOM->new->parse(
qq#<div id="<a>" \n test='='>Test<div id='><' /></div>#);
qq{<div id="<a>" \n test='='>Test<div id='><' /></div>});
is $dom->at('div[id="<a>"]')->attrs->{test}, '=', 'right attribute';
is $dom->at('[id="<a>"]')->text, 'Test', 'right text';
is $dom->at('[id="><"]')->attrs->{id}, '><', 'right attribute';

# Empty attributes
$dom = Mojo::DOM->new->parse(qq#<div test="" test2='' />#);
$dom = Mojo::DOM->new->parse(qq{<div test="" test2='' />});
is $dom->at('div')->attrs->{test}, '', 'empty attribute value';
is $dom->at('div')->attrs->{test2}, '', 'empty attribute value';

# Whitespaces before closing bracket
$dom = Mojo::DOM->new->parse(qq#<div >content</div>#);
$dom = Mojo::DOM->new->parse(qq{<div >content</div>});
ok $dom->at('div'), 'tag found';
is $dom->at('div')->text, 'content', 'right text';
is $dom->at('div')->content_xml, 'content', 'right text';

# Class with hyphen
$dom
= Mojo::DOM->new->parse(qq#<div class="a">A</div><div class="a-1">A1</div>#);
= Mojo::DOM->new->parse(qq{<div class="a">A</div><div class="a-1">A1</div>});
@div = ();
$dom->find('.a')->each(sub { push @div, shift->text });
is_deeply \@div, ['A'], 'found first element only';
Expand Down Expand Up @@ -1847,7 +1847,7 @@ is $dom->a->B->c->size, 2, 'right number of elements';
@results = ();
$dom->a->B->c->each(sub { push @results, $_->text });
is_deeply \@results, [qw(bar baz)], 'right results';
is $dom->a->B->c, qq#<c id="three">bar</c>\n<c ID="four">baz</c>#,
is $dom->a->B->c, qq{<c id="three">bar</c>\n<c ID="four">baz</c>},
'right result';
is_deeply [keys %$dom], [], 'root has no attributes';
is $dom->find('#nothing'), '', 'no result';
Expand Down Expand Up @@ -1883,7 +1883,7 @@ is $dom->a->b->c->size, 2, 'right number of elements';
@results = ();
$dom->a->b->c->each(sub { push @results, $_->text });
is_deeply \@results, [qw(bar baz)], 'right results';
is $dom->a->b->c, qq#<c id="three">bar</c>\n<c id="four">baz</c>#,
is $dom->a->b->c, qq{<c id="three">bar</c>\n<c id="four">baz</c>},
'right result';
is_deeply [keys %$dom], [], 'root has no attributes';
is $dom->find('#nothing'), '', 'no result';
Expand Down Expand Up @@ -2128,3 +2128,12 @@ is $dom->find('div > ul li')->[1]->text, 'B', 'right text';
is $dom->find('div > ul li')->[2], undef, 'no result';
is $dom->find('div > ul ul')->[0]->text, 'C', 'right text';
is $dom->find('div > ul ul')->[1], undef, 'no result';

# Recover from bad charset
# The charset name below is presumably unknown to the decoder, so parse() is
# expected to keep the raw input instead of decoded characters; every
# expectation in this section is therefore expressed in bytes.
$bytes = encode 'UTF-8',
  qq{<html><div id="a">A</div><div class="b">♥</div></html>};
$dom = Mojo::DOM->new->charset('doesnotexist');
$dom->parse($bytes);
is $dom->at('#a')->text, 'A', 'right text';

# The heart was never decoded, so its text is still the UTF-8 byte sequence
is $dom->at('.b')->text, encode('UTF-8', '♥'), 'right text';
is "$dom", $bytes, 'right result';

0 comments on commit c6ba823

Please sign in to comment.