Skip to content

Commit

Permalink
Improve munging of more bizarre non-pod
Browse files Browse the repository at this point in the history
  • Loading branch information
rwstauner committed Dec 14, 2014
1 parent 935e31e commit d683755
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 6 deletions.
25 changes: 19 additions & 6 deletions lib/MetaCPAN/Document/File.pm
Expand Up @@ -345,26 +345,39 @@ sub _build_pod {

# The pod parser is very liberal and will "start" a pod document when it
# sees /^=[a-zA-Z]/ even though it might be binary like /^=F\0?\{/.
# So strip out any lines that might match but are not lines we'd actually
# want in our pod text.
# So munge any lines that might match but are not usual pod directives
# that people would use (we don't need to index non-regular pod).
# Also see the test and comments in t/document/file.t for how
# bizarre constructs are handled.

$content =~ s/
# Pod::Simple::parse_string_document() "supports \r, \n ,\r\n"...
(\A|\r|\r\n|\n) # beginning of line
(?:
\A|\r|\r\n|\n) # beginning of line
\K # (keep those characters)
(
=[a-zA-Z][a-zA-Z0-9]* # looks like pod
(?! # but followed by something that isn't pod:
[a-zA-Z0-9] # more pod chars (the star won't be greedy enough)
| \s # whitespace ("=head1 NAME\n")
| $ # end of line ("=item\n"
| \s # whitespace ("=head1 NAME\n", "=item\n")
| \Z # end of line or end of doc
)
//gx;
)
# Prefix (to hide from Pod parser) instead of removing.
/\0$1/gx;

my $text = "";
$parser->output_string( \$text );
$parser->parse_string_document($content);
$text =~ s/\s+/ /g;
$text =~ s/ \z//;

# Remove any markers we put in the text.
# Should we remove other non-regular bytes that may come from the source?
$text =~ s/\0//g;

return \$text;
}

Expand Down
71 changes: 71 additions & 0 deletions t/document/file.t
Expand Up @@ -433,4 +433,75 @@ END
};
};

subtest 'pod intermixed with non-pod gibberish' => sub {

# This is totally made up in an attempt to see how we handle gibberish.
# The decisions of the handling are open to discussion.

my $badpod = <<BADPOD;
some\r=nonpod=ahem
=moreC<=notpod>
=head1[but no space]
BADPOD

my $content = <<END;
package Yo;
print <<OUTSIDE_OF_POD;
$badpod
OUTSIDE_OF_POD
=head1 Start-Pod
$badpod
last-word.
=cut
"code after pod";
END

my $file = MetaCPAN::Document::File->new(
%stub,
name => 'Yo.pm',
content_cb => sub { \$content }
);

test_attributes $file, {
sloc => 7,
slop => 6,
pod_lines => [ [ 13, 12 ], ],

# What *should* this parse to?
# * No pod before "Start-Pod".
# * The /^some/ line starts with "some" so the whole line is just text.
# ** Pod::Simple will catch the /\r=[a-z]/ and treat it as a directive:
# *** We probably don't want to remove the line start chars (/\r?\n?/)
# (or we'll throw off lines/blanks/etc...).
# *** If we keep the "\r" but remove the fake directive,
# the "\r" will touch the "=ahem" and the pod document will *start*
# and we'll get lots of text before the pod should start.
# *** So keep everything but mark them so Pod::Simple will skip them.
# ** The "\r" will count as "\s" and get squeezed into a single space.
# * So if /^=moreC/ is kept the <notpod> will retain the C.
# * When Pod::Simple sees /^head1\[/ it will start the pod document but
# it won't be a heading, it will just be text (along with everything after)
# which obviously was not the intention of the author. So as long as
# the author made a mistake and needs to fix pod:
# ** In the code, if we hide the "invalid" pod then we won't get the whole rest
# of the file being erroneously treated as pod.
# ** Inside the pod, if we left it alone, Pod::Simple would just dump it as
# text. If we mark it, the same thing will happen.

pod =>
q{Start-Pod some =nonpod=ahem =more"=notpod" =head1[but no space] last-word.},
};
};

done_testing;

0 comments on commit d683755

Please sign in to comment.