Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
improved Mojo::DOM::HTML to handle bad charsets more gracefully
  • Loading branch information
kraih committed Aug 21, 2012
1 parent 9109d1f commit 9af2889
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 15 deletions.
3 changes: 1 addition & 2 deletions Changes
@@ -1,7 +1,6 @@

3.33 2012-08-21
- - Improved Mojo::DOM::HTML to sometimes recover from bad charset
-   information.
+ - Improved Mojo::DOM::HTML to handle bad charsets more gracefully.
- Improved documentation.
- Improved tests.

Expand Down
7 changes: 2 additions & 5 deletions lib/Mojo/DOM/HTML.pm
Expand Up @@ -81,12 +81,9 @@ my %INLINE = map { $_ => 1 } @HTML4_INLINE, @HTML5_INLINE;
sub parse {
my ($self, $html) = @_;

- # Try to decode (and fall back to no charset)
+ # Try to decode
  my $charset = $self->charset;
- if ($charset && !defined($html = decode $charset, my $backup = $html)) {
- $html = $backup;
- $self->charset(undef);
- }
+ $html = decode($charset, $html) // return $self->charset(undef) if $charset;

# Tokenize
my $tree = ['root'];
Expand Down
13 changes: 5 additions & 8 deletions t/mojo/dom.t
Expand Up @@ -2,7 +2,7 @@ use Mojo::Base -strict;

use utf8;

- use Test::More tests => 790;
+ use Test::More tests => 789;

# "Homer gave me a kidney: it wasn't his, I didn't need it,
# and it came postage due- but I appreciated the gesture!"
Expand Down Expand Up @@ -2129,11 +2129,8 @@ is $dom->find('div > ul li')->[2], undef, 'no result';
is $dom->find('div > ul ul')->[0]->text, 'C', 'right text';
is $dom->find('div > ul ul')->[1], undef, 'no result';

- # Recover from bad charset
- $bytes = encode 'UTF-8',
- qq{<html><div id="a">A</div><div class="b">♥</div></html>};
+ # Bad charset
  $dom = Mojo::DOM->new->charset('doesnotexist');
- $dom->parse($bytes);
- is $dom->at('#a')->text, 'A', 'right text';
- is $dom->at('.b')->text, encode('UTF-8', '♥'), 'right text';
- is "$dom", $bytes, 'right result';
+ $dom->parse(qq{<html><div id="a">A</div></html>});
+ is $dom->at('#a'), undef, 'no result';
+ is "$dom", '', 'right result';

0 comments on commit 9af2889

Please sign in to comment.