#!/usr/bin/perl -w use strict; use HTML::TreeBuilder; use HTML::Entities; my $use_tidy = 1; my $ldquo = decode_entities "“"; my $rdquo = decode_entities "”"; my @block_elts = # from http://www.w3.org/TR/CSS21/sample.html qw(html address blockquote body dd div dl dt fieldset form frame frameset h1 h2 h3 h4 h5 h6 noframes ol p ul center dir hr menu pre); push @block_elts, 'br'; for my $file (@ARGV ? @ARGV : []) { my $parser = HTML::TreeBuilder->new; $parser->ignore_unknown(0); $parser->store_declarations(1); my $fh; unless (ref $file) { open($fh, "<:utf8", $file) or die "$0: error opening $file: $!\n"; } else { $fh = *DATA; $file = ''; } my $tree = $parser->parse_file($fh); # HTML::TreeBuilder(3pm): It is somewhat of a known bug (to be fixed # one of these days, if anyone needs it?) that declarations in the # preamble (before the "html" start-tag) end up actually under the # "html" element. my $doctype_declaration = ''; for ($tree->look_down(_tag => '~declaration', text => qr/^DOCTYPE/)) { $doctype_declaration = $_->as_HTML; $_->delete; } # insert a newline in block elements for matching against my @newlines = map { $_->splice_content(0,0,"\n"); [$_->content_refs_list]->[0] } $tree->find(@block_elts); $tree->objectify_text(); my @text_els = $tree->look_down(_tag => '~text'); my $text = join '', map { $_->attr('text') } @text_els; my ($i, $seen, $added) = (0, 0, 0); ######################################################### the regex: my $before = decode_entities("–—")."\\s(\[żĄ"; # my $after = decode_entities("–—")."\\s[[:punct:]]"; my $after = decode_entities("–—")."\\s:;,.!?)\]"; while ($text =~ m/( (?= $seen + length($text_els[$i]->attr('text')) - $added) { $seen += length $text_els[$i++]->attr('text'); } die if $i > @text_els; my $t = $text_els[$i]->attr('text'); my $origlen = length $t; ############################### the substitution: substr $t, $p - $seen + $added, length $&, $1 ? $ldquo : $rdquo; ############################### $added += length($t) - $origlen; $text_els[$i]->attr('text', $t); } # remove the newline inserted above $_->delete for map { $$_ } @newlines; $tree->deobjectify_text(); open(STDOUT, '|tidy -qi') if $use_tidy; print $doctype_declaration, $tree->as_HTML; next; } __DATA__ test

"this "is" a (empty quote:) "" test--"

"or any" tags
"except
" "<br>"

dash—"quotes" here

"quoted (threequotes: """) block;"

"But punctuation goes inside quotes", he said: "semicolon"; that is, "..."

("What about parentheses?")

but what about 3.5" floppies and " mis"takes?

let them use 3.5″ floppies!