#!/usr/bin/perl -w
use strict;
use HTML::TreeBuilder;
use HTML::Entities;
my $use_tidy = 1;
my $ldquo = decode_entities "“";
my $rdquo = decode_entities "”";
my @block_elts = # from http://www.w3.org/TR/CSS21/sample.html
qw(html address blockquote body dd div dl dt fieldset form frame
frameset h1 h2 h3 h4 h5 h6 noframes ol p ul center dir hr menu pre);
push @block_elts, 'br';
for my $file (@ARGV ? @ARGV : []) {
my $parser = HTML::TreeBuilder->new;
$parser->ignore_unknown(0);
$parser->store_declarations(1);
my $fh;
unless (ref $file) {
open($fh, "<:utf8", $file) or die "$0: error opening $file: $!\n";
} else {
$fh = *DATA;
$file = '';
}
my $tree = $parser->parse_file($fh);
# HTML::TreeBuilder(3pm): It is somewhat of a known bug (to be fixed
# one of these days, if anyone needs it?) that declarations in the
# preamble (before the "html" start-tag) end up actually under the
# "html" element.
my $doctype_declaration = '';
for ($tree->look_down(_tag => '~declaration', text => qr/^DOCTYPE/)) {
$doctype_declaration = $_->as_HTML;
$_->delete;
}
# insert a newline in block elements for matching against
my @newlines =
map { $_->splice_content(0,0,"\n"); [$_->content_refs_list]->[0] }
$tree->find(@block_elts);
$tree->objectify_text();
my @text_els = $tree->look_down(_tag => '~text');
my $text = join '', map { $_->attr('text') } @text_els;
my ($i, $seen, $added) = (0, 0, 0);
######################################################### the regex:
my $before = decode_entities("–—")."\\s(\[żĄ";
# my $after = decode_entities("–—")."\\s[[:punct:]]";
my $after = decode_entities("–—")."\\s:;,.!?)\]";
while ($text =~ m/( (?= $seen + length($text_els[$i]->attr('text')) - $added) {
$seen += length $text_els[$i++]->attr('text');
}
die if $i > @text_els;
my $t = $text_els[$i]->attr('text');
my $origlen = length $t;
############################### the substitution:
substr $t, $p - $seen + $added,
length $&,
$1 ? $ldquo : $rdquo;
###############################
$added += length($t) - $origlen;
$text_els[$i]->attr('text', $t);
}
# remove the newline inserted above
$_->delete for map { $$_ } @newlines;
$tree->deobjectify_text();
open(STDOUT, '|tidy -qi') if $use_tidy;
print $doctype_declaration, $tree->as_HTML;
next;
}
__DATA__
"this "is" a (empty quote:) "" test--" "or any" tags dash—"quotes" here
"except
" "<br>""quoted (threequotes: """) block;"
"But punctuation goes inside quotes", he said: "semicolon"; that is, "..."
("What about parentheses?")
but what about 3.5" floppies and " mis"takes?
let them use 3.5″ floppies!