Skip to content

Commit

Permalink
fix unescaping of HTML5 attribute values in Mojo::DOM::HTML (closes #…
Browse files Browse the repository at this point in the history
  • Loading branch information
kraih committed Mar 9, 2017
1 parent 900d81b commit 852a71a
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 24 deletions.
4 changes: 3 additions & 1 deletion Changes
@@ -1,5 +1,7 @@

7.29 2017-03-08
7.29 2017-03-09
- Added html_attr_unescape function to Mojo::Util.
- Fixed unescaping of HTML5 attribute values in Mojo::DOM::HTML.

7.28 2017-03-07
- Added copy_to, realpath and sibling methods to Mojo::File.
Expand Down
4 changes: 2 additions & 2 deletions lib/Mojo/DOM/HTML.pm
@@ -1,7 +1,7 @@
package Mojo::DOM::HTML;
use Mojo::Base -base;

use Mojo::Util qw(html_unescape xml_escape);
use Mojo::Util qw(html_attr_unescape html_unescape xml_escape);
use Scalar::Util 'weaken';

has tree => sub { ['root'] };
Expand Down Expand Up @@ -125,7 +125,7 @@ sub parse {
# Empty tag
++$closing and next if $key eq '/';

$attrs{$key} = defined $value ? html_unescape $value : $value;
$attrs{$key} = defined $value ? html_attr_unescape $value : $value;
}

# "image" is an alias for "img"
Expand Down
54 changes: 37 additions & 17 deletions lib/Mojo/Util.pm
Expand Up @@ -51,14 +51,17 @@ my %XML = (
# "Sun, 06 Nov 1994 08:49:37 GMT" and "Sunday, 06-Nov-94 08:49:37 GMT"
my $EXPIRES_RE = qr/(\w+\W+\d+\W+\w+\W+\d+\W+\d+:\d+:\d+\W*\w+)/;

# HTML entities
my $ENTITY_RE = qr/&(?:\#((?:[0-9]{1,7}|x[0-9a-fA-F]{1,6}));|(\w+[;=]?))/;

# Encoding cache
my %CACHE;

our @EXPORT_OK = (
qw(b64_decode b64_encode camelize class_to_file class_to_path decamelize),
qw(decode deprecated dumper encode extract_usage getopt hmac_sha1_sum),
qw(html_unescape md5_bytes md5_sum monkey_patch punycode_decode),
qw(punycode_encode quote secure_compare sha1_bytes sha1_sum),
qw(html_attr_unescape html_unescape md5_bytes md5_sum monkey_patch),
qw(punycode_decode punycode_encode quote secure_compare sha1_bytes sha1_sum),
qw(split_cookie_header split_header steady_time tablify term_escape trim),
qw(unindent unquote url_escape url_unescape xml_escape xor_encode)
);
Expand Down Expand Up @@ -155,12 +158,8 @@ sub getopt {
Getopt::Long::Configure($save);
}

sub html_unescape {
my $str = shift;
$str
=~ s/&(?:\#((?:[0-9]{1,7}|x[0-9a-fA-F]{1,6}));|(\w+;?))/_decode($1, $2)/ge;
return $str;
}
# Unescape HTML entities in the first argument with the attribute flag of
# "_html" switched on
sub html_attr_unescape { _html(shift, 1) }
# Unescape HTML entities in the first argument with the attribute flag of
# "_html" switched off
sub html_unescape { _html(shift, 0) }

# Declared in Mojo::Base to avoid circular require problems
sub monkey_patch { Mojo::Base::_monkey_patch(@_) }
Expand Down Expand Up @@ -367,25 +366,27 @@ sub _adapt {
return $k + (((PC_BASE - PC_TMIN + 1) * $delta) / ($delta + PC_SKEW));
}

sub _decode {
my ($point, $name) = @_;
# Return the cached "find_encoding" result for the given name, the lookup is
# only performed while no defined value is stored in %CACHE for that name, and
# an undefined lookup result croaks with "Unknown encoding"
sub _encoding {
$CACHE{$_[0]} //= find_encoding($_[0]) // croak "Unknown encoding '$_[0]'";
}

sub _entity {
my ($point, $name, $attr) = @_;

# Code point
return chr($point !~ /^x/ ? $point : hex $point) unless defined $name;

# Named character reference
my $rest = '';
my $rest = my $last = '';
while (length $name) {
return $ENTITIES{$name} . reverse $rest if exists $ENTITIES{$name};
$rest .= chop $name;
return $ENTITIES{$name} . reverse $rest
if exists $ENTITIES{$name}
&& (!$attr || $name =~ /;$/ || $last !~ /[A-Za-z0-9=]/);
$rest .= $last = chop $name;
}
return '&' . reverse $rest;
}

sub _encoding {
$CACHE{$_[0]} //= find_encoding($_[0]) // croak "Unknown encoding '$_[0]'";
}

# Supported on Perl 5.14+
sub _global_destruction {
defined ${^GLOBAL_PHASE} && ${^GLOBAL_PHASE} eq 'DESTRUCT';
Expand Down Expand Up @@ -418,6 +419,12 @@ sub _header {
return [@part ? (@tree, \@part) : @tree];
}

# Replace every $ENTITY_RE match in a copy of the string with the result of
# "_entity" and return that copy, the "$attr" flag is handed through unchanged
sub _html {
my ($str, $attr) = @_;
# $1 is the numeric form (decimal or "x" prefixed hex), $2 the named form
$str =~ s/$ENTITY_RE/_entity($1, $2, $attr)/geo;
return $str;
}

sub _options {

# Hash or name (one)
Expand Down Expand Up @@ -631,6 +638,19 @@ Generate HMAC-SHA1 checksum for bytes.
# "11cedfd5ec11adc0ec234466d8a0f2a83736aa68"
hmac_sha1_sum 'foo', 'passw0rd';
=head2 html_attr_unescape

  my $str = html_attr_unescape $escaped;

Same as L</"html_unescape">, but handles special rules from the
L<HTML Living Standard|https://html.spec.whatwg.org> for HTML attributes.

  # "foo=bar&ltest=baz"
  html_attr_unescape 'foo=bar&ltest=baz';

  # "foo=bar<est=baz"
  html_attr_unescape 'foo=bar&lt;est=baz';

=head2 html_unescape

  my $str = html_unescape $escaped;
Expand Down
12 changes: 12 additions & 0 deletions t/mojo/dom.t
Expand Up @@ -786,6 +786,18 @@ is $dom->at('.line1')->tag, 'div', 'right tag';
is $dom->at('.line2')->tag, 'div', 'right tag';
is $dom->at('.line3'), undef, 'no result';

# Entities in attributes
$dom = Mojo::DOM->new(qq{<a href="/?foo&lt=bar"></a>});
is $dom->at('a')->{href}, '/?foo&lt=bar', 'right attribute value';
$dom = Mojo::DOM->new(qq{<a href="/?f&ltoo=bar"></a>});
is $dom->at('a')->{href}, '/?f&ltoo=bar', 'right attribute value';
$dom = Mojo::DOM->new(qq{<a href="/?f&lt-oo=bar"></a>});
is $dom->at('a')->{href}, '/?f<-oo=bar', 'right attribute value';
$dom = Mojo::DOM->new(qq{<a href="/?foo=&lt"></a>});
is $dom->at('a')->{href}, '/?foo=<', 'right attribute value';
$dom = Mojo::DOM->new(qq{<a href="/?f&lt;oo=bar"></a>});
is $dom->at('a')->{href}, '/?f<oo=bar', 'right attribute value';

# Whitespaces before closing bracket
$dom = Mojo::DOM->new('<div >content</div>');
ok $dom->at('div'), 'tag found';
Expand Down
19 changes: 15 additions & 4 deletions t/mojo/util.t
Expand Up @@ -11,10 +11,10 @@ use Mojo::DeprecationTest;
use Mojo::Util
qw(b64_decode b64_encode camelize class_to_file class_to_path decamelize),
qw(decode dumper encode extract_usage getopt hmac_sha1_sum html_unescape),
qw(md5_bytes md5_sum monkey_patch punycode_decode punycode_encode quote),
qw(secure_compare sha1_bytes sha1_sum split_cookie_header split_header),
qw(steady_time tablify term_escape trim unindent unquote url_escape),
qw(url_unescape xml_escape xor_encode);
qw(html_attr_unescape md5_bytes md5_sum monkey_patch punycode_decode),
qw(punycode_encode quote secure_compare sha1_bytes sha1_sum),
qw(split_cookie_header split_header steady_time tablify term_escape trim),
qw(unindent unquote url_escape url_unescape xml_escape xor_encode);

# camelize
is camelize('foo_bar_baz'), 'FooBarBaz', 'right camelized result';
Expand Down Expand Up @@ -236,6 +236,17 @@ is html_unescape('foobar'), 'foobar', 'no changes';
is html_unescape('&0&Ltf&amp&0oo&nbspba;&ltr'), "&0&Ltf&&0oo\x{00a0}ba;<r",
'right HTML unescaped result';

# html_attr_unescape
is html_attr_unescape('/?foo&lt=bar'), '/?foo&lt=bar',
'right HTML unescaped result';
is html_attr_unescape('/?f&ltoo=bar'), '/?f&ltoo=bar',
'right HTML unescaped result';
is html_attr_unescape('/?f&lt-oo=bar'), '/?f<-oo=bar',
'right HTML unescaped result';
is html_attr_unescape('/?foo=&lt'), '/?foo=<', 'right HTML unescaped result';
is html_attr_unescape('/?f&lt;oo=bar'), '/?f<oo=bar',
'right HTML unescaped result';

# html_unescape (Bengali digits with nothing to unescape)
is html_unescape('&#০৩৯;&#x০৩৯;'), '&#০৩৯;&#x০৩৯;',
'no changes';
Expand Down

0 comments on commit 852a71a

Please sign in to comment.