#!/usr/local/bin/perl -s
#
# Time-stamp: <95/09/15 02:32:32 rgs>
#
# Usage: htmlify [-quiet -width=n -nohyphen -smtp -dlpattern=regex
#                 -<cdes>header=Hn] file
#   "quiet" suppresses the "Unhyphenating line" messages written to stderr.
#   "width" is the typical width of a line [default=72].
#   If "nohyphen" is not specified, assume that the document splits
#     unhyphenated words at syllable boundaries, so the hyphen that ends a
#     long line is dropped when the word is rejoined.  With "nohyphen" the
#     hyphen is kept.
#   "cheader", "dheader", "eheader", "sheader" are the types of header
#     (i.e. "H1") which should be used for implicit headers formed by
#     "all caps", "dash (-) underlining", "equal (=) underlining", or
#     "asterisk (*) underlining".
#   "smtp" is used to indicate that this file may contain headers from a mail
#     or netnews message.
#   "dlpattern" is a pattern which can be matched at the beginning of a line
#     to indicate that the line contains a "term" in a "definition list".
#
# This script was written by Robert Stockton (rgs@cs.cmu.edu)
# This falls under the category of "quick hack".  It is unlikely to be
# especially readable or maintainable, nor can it be considered "industrial
# strength".  However, it has proven useful enough to be worth keeping
# until something better comes along.
#
#If you are reading this via mosaic, it may be interpreted as html --
#it will probably be clearer if you read the "document source" instead....
use strict;

# Command-line switches.  They are filled in by perl's -s option before the
# script body runs, so they must remain package variables and must not be
# unconditionally initialised here.
our ($quiet, $width, $nohyphen, $smtp, $dlpattern);
our ($cheader, $dheader, $eheader, $sheader);

$/ = "";                        # paragraph mode
$| = 1;                         # flush immediately

defined($cheader) || ($cheader = "H2");
defined($dheader) || ($dheader = "H2");
defined($eheader) || ($eheader = "H1");
defined($sheader) || ($sheader = "H1");
$width || ($width = 72);
$smtp  || ($smtp = 0);
# $dlpattern = "^[^ ].*:";

# A long line ending in a single hyphen is joined with the following line.
# With -nohyphen the hyphen is part of the captured text (kept); otherwise it
# is left out of the capture (dropped).
my $hyphen_re = $nohyphen ? qr/^(.*[^-]-) *$/ : qr/^(.*[^-])- *$/;

# Compiled once, like the original /o match.
my $dl_re = $dlpattern ? qr/$dlpattern/ : undef;

# Underlined headings, tried in this order: [ header tag, pattern ].
# The patterns capture (leading blanks, heading text, underline).
# /m replaces the removed "$* = 1" multi-line switch.
my @underline_styles = (
    [ $dheader, qr/^( *)(.*[^ \n]) *\n *(-+) *\n/m  ],
    [ $sheader, qr/^( *)(.*[^ \n]) *\n *(\*+) *\n/m ],
    [ $eheader, qr/^( *)(.*[^ \n]) *\n *(=+) *\n/m  ],
);

# Output state shared by dotext, prline and prbreak.
my $title   = "*** No Title ***";  # used for <TITLE> when output starts
my $inbody  = 0;                   # true once the document head was printed
my $newpara = 0;                   # a paragraph break is pending
my $waspre  = 0;                   # a <PRE> block is currently open
my $dlhack  = 0;                   # last definition term had no text after it
my $lihack;                        # indent of the list item being continued
my @indstack;                      # indents of the open nesting levels
my @tagstack;                      # tags of the open nesting levels

while (defined(my $para = <>)) {
    my $oldtxt = "";
    # length() rather than truth, so a paragraph consisting of "0" is kept.
    while (length $para) {
        my ($txt, $sep, $rest) = split(/^(.+\n\s*-+\s*\n|.+\n\s*\*+\s*\n|.+\n\s*=+\s*\n|[^a-z\n]*[A-Z][^a-z\n]*\n|--+\n|==+\n|\+\++\n|__+\n)/m, $para, 2);
        $txt = "" unless defined $txt;
        if (defined $sep) {
            $sep = expand_tabs($sep);
            if (handle_separator($oldtxt . $txt, $sep)) {
                $oldtxt = "";
            }
            else {
                # Not a break after all (e.g. that header type is disabled):
                # keep it as ordinary text.
                $oldtxt .= $txt . $sep;
            }
            $para = defined $rest ? $rest : "";
        }
        else {
            dotext($oldtxt . $txt);
            $oldtxt = "";
            $para = "";
        }
    }
    # Text kept back by an unclaimed separator at the very end of the
    # paragraph would otherwise be lost.
    dotext($oldtxt) if length $oldtxt;
    $newpara = 1;
}

# prbreak opens the document itself if nothing was printed yet, so the
# closing tags are always emitted and empty input still yields a balanced
# document.
prbreak("</BODY>\n</HTML>");
exit;

# escape_html(STRING) -- return STRING with &, > and < replaced by entities.
sub escape_html {
    my ($s) = @_;
    $s =~ s/\&/&amp;/g;
    $s =~ s/\>/&gt;/g;
    $s =~ s/\</&lt;/g;
    return $s;
}

# expand_tabs(STRING) -- return STRING with each tab replaced by blanks up to
# the next multiple-of-8 column.  Columns are counted from the start of the
# line containing the tab, so multi-line strings are handled too.
sub expand_tabs {
    my ($s) = @_;
    while ((my $tab = index($s, "\t")) >= 0) {
        my $col = $tab - (rindex($s, "\n", $tab) + 1);
        substr($s, $tab, 1) = ' ' x (8 - ($col % 8));
    }
    return $s;
}

# handle_separator(PENDING, SEP) -- SEP is the text that split the paragraph:
# an underlined line, an all-caps line, or a rule such as "----".  If it is
# accepted as a heading or rule, PENDING (the text before it) is flushed
# through dotext, the break is printed, and 1 is returned.  Otherwise nothing
# is printed and 0 is returned.
sub handle_separator {
    my ($pending, $sep) = @_;

    foreach my $style (@underline_styles) {
        my ($tag, $re) = @$style;
        next unless $tag && $sep =~ $re;
        my ($lead, $heading, $rule) = ($1, $2, $3);
        if (length($heading) == length($rule)) {
            # Heading text reaches the output without passing through
            # dotext, so it has to be escaped here.
            my $safe = escape_html($heading);
            $title = $safe;
            dotext($pending);
            prbreak("$lead<$tag>$safe</$tag>");
        }
        else {
            # Underline of a different length: ordinary text followed by a
            # horizontal rule.
            dotext($pending . "$lead$heading\n");
            prbreak("<HR>");
        }
        return 1;
    }

    if ($cheader && $sep =~ /^( *)([^a-z\n]*[A-Z][^a-z\n]*)\n/m) {
        my ($lead, $heading) = ($1, $2);
        my $safe = escape_html($heading);
        $title = $safe;
        dotext($pending);
        prbreak("$lead<$cheader>$safe</$cheader>");
        return 1;
    }

    if ($sep =~ /^[ \t]*(--+\n|==+\n|\+\++\n|__+\n)$/m) {
        dotext($pending);
        prbreak("<HR>");
        return 1;
    }

    return 0;
}

# dotext(TEXT) -- escape a chunk of plain text and print it line by line
# through prline, classifying each line as a definition term, quoted mail
# text, list item, preformatted line or running text.  With -smtp, a chunk
# that looks like a block of mail headers is printed as one <PRE> block and
# its Subject: line becomes the title.
sub dotext {
    my $txt = escape_html(pop(@_));

    if ($smtp && $txt =~ /^[^\t \n:]+:.*(\n[ \t].*|\n[^\t \n:]+:.*)*\n*$/) {
        # One attempt at coming up with clean headers -- a better
        # implementation of <DL COMPACT> might allow this.
        #   $txt =~ s/\n([^\t \n:]+):/"<DT><B>$1<\/B>:<DD>"/eg;
        #   $txt =~ s/^([^\t \n:]+):/"<DL COMPACT>\n<DT><B>$1<\/B>:<DD>"/e;
        #   print("$txt\n</DL>\n");
        if ($txt =~ /subject:[\t ]*(.*)/i) {
            $title = $1;
        }
        $txt =~ s/\n*$//g;
        prline(0, $txt, 0, "PRE");
        return;
    }

    # Explicit array: the implicit split into @_ no longer exists.
    my @lines = split(/\n/, $txt);

    # Smallest indent in the chunk.
    my $indent = 9999;
    foreach my $line (@lines) {
        $line = expand_tabs($line);
        my ($lead) = $line =~ /^( *)/;
        $indent = length($lead) if length($lead) < $indent;
    }

    # Indent of the first line relative to the smallest indent.
    my ($first_lead) = (@lines ? $lines[0] : "") =~ /^( *)/;
    my $pindent = length($first_lead) - $indent;

    my $carry  = "";            # text waiting to be joined with the next line
    my $hyphen = 0;             # previous line was held back for joining
    my ($held_indent, $held_line);

    foreach my $line (@lines) {
        my ($lead, $body) = $line =~ /^( *)(.*)/;
        my $lindent = length($lead);
        $line = $carry . $body if $hyphen;
        $carry = $body;
        if (($pindent != 0) && ($lindent == $pindent)) {
            $lindent = $indent unless defined($lihack);
            $newpara = 1;
        }
        $hyphen = 0;

        if ($line =~ /^ *$/
            || ($smtp && $line =~ /^(([ :+|]|&gt;)*(&gt;|[:+|])) *$/)) {
            $newpara = 1;
        }
        elsif ($dl_re && $line =~ $dl_re) {
            my ($term, $dlrest, $group) = ($&, $', $1);
            my $pad = (defined $group ? length($group) : 0) - 4;
            prline($lindent, "<DT>" . ' ' x $pad . $term, 0, "DL");
            # Same argument order as every other prline call.
            prline(length($term), "<DD>$dlrest", 0, "DD");
            $dlhack = 1 if $dlrest eq "";
        }
        elsif ($smtp && $line =~ /^(([ :+|]|&gt;)*(&gt;|[:+|]))/) {
            my ($prefix, $quoted) = ($1, $');
            # should be BLOCKQUOTE, but it doesn't nest properly in Mosaic2.1
            prline(length($prefix), $quoted, 0, "UL");
        }
        elsif ($line =~ /^( *[\(\[]?[0-9]+[\)\.\]] +)(.*)/) {
            my ($marker, $item) = ($1, $2);
            prline(length($marker),
                   "<LI>" . ' ' x (length($marker) - 4) . $item, 0, "OL");
            $lihack = $lindent;
        }
        elsif ($line =~ /^( *[\[\(]?[*-]+[\)\]]? +)(.*)/) {
            my ($marker, $item) = ($1, $2);
            prline(length($marker),
                   "<LI>" . ' ' x (length($marker) - 4) . $item, 0, "UL");
            $lihack = $lindent;
        }
        elsif (length($line) < ($width - 12)) {
            # A short line is running text only if it ends a sentence.
            if ($line =~ /[\.\?\!\:\-][\)\]\"\']* *$/) {
                prline($lindent, $line);
            }
            else {
                prline($lindent, $line, 0, "PRE");
            }
        }
        elsif ($line =~ $hyphen_re) {
            my $joined = $1;
            print(STDERR "*** Unhyphenating line: $carry\n")
                unless defined($quiet);
            ($held_indent, $held_line) = ($lindent, $line);
            $carry  = $joined;
            $hyphen = 1;
        }
        else {
            prline($lindent, $line);
        }
    }

    # A line held back for joining has no successor in this chunk: print it
    # as it stood instead of dropping it.
    prline($held_indent, $held_line) if $hyphen;
}

# open_document() -- print the document head the first time it is called.
sub open_document {
    return if $inbody++;
    print("<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n");
}

# close_pre() -- close an open <PRE> block; returns true if one was open.
sub close_pre {
    return 0 unless $waspre;
    print("</PRE>\n");
    $waspre = 0;
    return 1;
}

# close_tag(TAG) -- print the closing tag for a nesting level.  "TEXT" is a
# placeholder that never had an opening tag, and DD is never closed.
sub close_tag {
    my ($tag) = @_;
    print("</$tag>\n") unless ($tag eq "TEXT" || $tag eq "DD");
}

# top_indent() -- indent of the innermost open level, 0 if none is open.
sub top_indent {
    return @indstack ? $indstack[-1] : 0;
}

# prline(INDENT, TEXT [, NEWNEWPARA [, TAG]]) -- print one line of output.
#   INDENT      column the line belongs to; decides nesting
#   TEXT        text to print, already escaped
#   NEWNEWPARA  value left in $newpara afterwards (default 0)
#   TAG         "TEXT" (default), "PRE", "UL", "OL", "DL" or "DD"
# Levels deeper than INDENT are closed first.  PRE lines, and TEXT lines
# indented deeper than the current level, go into a <PRE> block.
sub prline {
    my ($indent, $text, $newnewpara, $tag) = @_;
    $tag        ||= "TEXT";
    $newnewpara ||= 0;

    open_document();

    if ($dlhack) {
        # The previous term had no definition text on its own line, so this
        # line sets the indent of the definition level.
        $indstack[-1] = $indent if @indstack && $tag ne "DL";
        $dlhack = 0;
    }

    if (!$newpara && defined($lihack) && $indent == $lihack) {
        # Continuation line of the current list item.
        $indent = top_indent();
    }
    elsif (!($newpara && $indent == top_indent())) {
        undef($lihack);
    }

    while (@indstack && $indstack[-1] > $indent) {
        $newpara = 0 if close_pre();
        pop(@indstack);
        close_tag(pop(@tagstack));
    }

    if ($tag eq "TEXT" && !@tagstack) {
        push(@indstack, $indent);
        push(@tagstack, $tag);
        $newpara = 0;
    }

    if ($tag eq "PRE" || ($tag eq "TEXT" && $indent > top_indent())) {
        my $dedent = substr($text, top_indent());
        $dedent = "" unless defined $dedent;
        if ($waspre++) {
            print("\n") if $newpara;
            print("$dedent\n");
        }
        else {
            print("<PRE>\n$dedent\n");
        }
        $newpara = $newnewpara;
        return;
    }
    $newpara = 0 if close_pre();

    if (@indstack && $indstack[-1] == $indent) {
        if ($tag ne $tagstack[-1] && $tag ne "TEXT" && $tag ne "DD") {
            # Different kind of block at the same indent.  close_tag keeps
            # </TEXT> and </DD> out of the output, as at every other place
            # where a level is closed.
            close_tag(pop(@tagstack));
            print("<P>\n") if $newpara;
            push(@tagstack, $tag);
            print("<$tag>\n");
        }
        else {
            print("<P>\n") if $newpara;
        }
    }
    else {
        push(@indstack, $indent);
        push(@tagstack, $tag);
        print("<P>\n") if $newpara;
        print("<$tag>\n") unless $tag eq "DD";
    }
    print("$text\n");
    $newpara = $newnewpara;
}

# prbreak(TEXT) -- close every open level, then print TEXT (a heading, <HR>
# or the document trailer) on a line of its own and clear $newpara.
sub prbreak {
    my ($text) = @_;

    open_document();
    $dlhack = 0;
    # A <PRE> block can be open while the level stack is empty, so it is
    # closed unconditionally rather than only while popping levels.
    close_pre();
    while (@indstack) {
        pop(@indstack);
        close_tag(pop(@tagstack));
    }
    print("$text\n");
    $newpara = 0;
}
#</plaintext>