Skip to content

Commit

Permalink
improved bad charset handling in Mojo::DOM::HTML
Browse files: browse the repository at this point in the history
  • Loading branch information
kraih committed Mar 17, 2013
1 parent 99dff6e commit 4eb1364
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 5 deletions.
1 change: 1 addition & 0 deletions Changes
@@ -1,5 +1,6 @@

3.91 2013-03-17
- Improved bad charset handling in Mojo::DOM::HTML.
- Improved tests.
- Fixed support for RFC 2817 in Mojo::Message::Request.
- Fixed whitespace bug in Mojo::DOM::HTML.
Expand Down
6 changes: 4 additions & 2 deletions lib/Mojo/DOM/HTML.pm
Expand Up @@ -76,8 +76,10 @@ my %INLINE = map { $_ => 1 } (
sub parse {
my ($self, $html) = @_;

my $charset = $self->charset;
$html = decode($charset, $html) // return $self->charset(undef) if $charset;
if (my $charset = $self->charset) {
if (my $chars = decode $charset, $html) { $html = $chars }
else { $self->charset(undef) }
}

my $tree = ['root'];
my $current = $tree;
Expand Down
12 changes: 9 additions & 3 deletions t/mojo/dom.t
Expand Up @@ -2144,9 +2144,15 @@ is "$dom", '<span>a</span><b>b</b><span>c</span>', 'right result';

# Bad charset
$dom = Mojo::DOM->new->charset('doesnotexist');
$dom->parse(qq{<html><div id="a">A</div></html>});
is $dom->at('#a'), undef, 'no result';
is "$dom", '', 'right result';
$dom->parse('<html><div id="a">A</div></html>');
is $dom->charset, undef, 'no charset';
is $dom->at('#a')->text, 'A', 'right text';
is "$dom", '<html><div id="a">A</div></html>', 'right result';
$dom = Mojo::DOM->new->charset('UTF-8');
$dom->parse(qq{<div id="invalid">\x89</div>});
is $dom->charset, undef, 'no charset';
is $dom->at('#invalid')->text, "\x89", 'right text';
is "$dom", qq{<div id="invalid">\x89</div>}, 'right result';

# Comments
$dom = Mojo::DOM->new(<<EOF);
Expand Down

0 comments on commit 4eb1364

Please sign in to comment.