txt2html
Wie angedeutet besteht mein Hobby darin, vergriffene Bücher zu scannen und ins EPUB-Format zu konvertieren. Die Texterkennung (Tesseract) produziert ein Textdokument mit „abgetippten“ Einzelzeilen. Das Tool txt2html.tcl fügt die durch Silbentrennung am Zeilenende getrennten Wörter wieder zusammen und erzeugt daraus ein HTML-Dokument. Einzelne Kommentare und Metadaten im produzierten HTML dienen meinem persönlichen Bedarf, stören aber nicht weiter.
#!/usr/bin/tclsh
# Command line: first argument is the input text file, the optional second
# argument is the output file. A missing argument yields the empty string.
set infile  [lindex $argv 0]
set outfile [lindex $argv 1]
proc echo {args} {
    # Debug helper: print all arguments, as one Tcl list, on stdout.
    puts stdout $args
}
proc cat {file {encoding ""}} {
    # Return the complete contents of $file.
    #
    # file      path of the file to read
    # encoding  optional channel encoding name; when empty, the channel
    #           keeps the encoding it was opened with
    #
    # Raises the underlying error if the file cannot be opened, if the
    # encoding name is not accepted, or if reading fails. The channel is
    # closed in every case, so a failure does not leak an open channel.
    set port [open $file]
    set failed [catch {
        if {$encoding ne ""} {
            fconfigure $port -encoding $encoding
        }
        read $port
    } contents options]
    close $port
    if {$failed} {
        # Re-raise the original error with its original error info.
        return -options $options $contents
    }
    return $contents
}
proc saveString {string file} {
    # Write $string to $file without adding a trailing newline.
    # The file is opened in mode w, i.e. created or truncated.
    set out [open $file w]
    chan puts -nonewline $out $string
    chan close $out
}
proc swiss? {txt} {
    # Decide which quotation convention $txt uses.
    # Returns true when the first left-pointing mark (one of « ‹ <) occurs
    # before the first right-pointing mark (one of » › >), and false
    # otherwise, including when either kind of mark is missing.
    if {![regexp -indices {[«‹<]} $txt firstLeft]} {
        return false
    }
    if {![regexp -indices {[»›>]} $txt firstRight]} {
        return false
    }
    # Each match variable holds a start/end index pair; compare the starts.
    if {[lindex $firstLeft 0] < [lindex $firstRight 0]} {
        return true
    }
    return false
}
# --- Text normalisation pipeline -------------------------------------------
# Reads the input file and rewrites it step by step in the variable txt.
# The order of the steps matters: quotation marks and ampersands are
# converted before any HTML tags are generated, so the generated tags are
# not touched by the character mapping.

# Collapse every run of hyphens, en dashes and em dashes into one hyphen.
regsub -all {[-–—]+} [cat $infile] - txt
# Replace two or more dots, with optional whitespace in front of each,
# by a space followed by three dots.
regsub -all {(\s*[.]){2,}} $txt " ..." txt
# Expand the typographic ligatures U+FB00..U+FB04 into plain letters.
set txt [string map "\ufb00 ff \ufb01 fi \ufb02 fl \ufb03 ffi \ufb04 ffl" $txt]
# Turn ASCII angle brackets into guillemets; only the order of the nested
# (tripled) marks depends on the detected convention. The ampersand is
# escaped here as well: the result is embedded in an XHTML document, where
# a bare ampersand is not well-formed. (The previous mapping of the
# ampersand onto itself had no effect.)
if {[swiss? $txt]} {
    set txt [string map {>>> ›» <<< «‹ >> » << « > › < ‹ & &amp;} $txt]
} else {
    set txt [string map {>>> »› <<< ‹« >> » << « > › < ‹ & &amp;} $txt]
}
# Disabled in the original script, kept for reference:
# regsub -all {(?:\s+[[:punct:]|lI])+\n} $txt \n txt
# regsub -all {\n(?:[[:punct:]|lI]\s+)+} $txt \n txt
# Remove blanks at the start of a line.
regsub -all {\n +} $txt \n txt
# Reduce three or more consecutive newlines to one empty line.
regsub -all {\n{3,}} $txt \n\n txt
# A "k-" / "k" split between vowels across a line break becomes "ck"
# (historic German hyphenation of ck).
regsub -all {([AEIOUÄÖÜaeiouäöü])k-\nk([aeiouäöü])} $txt {\1ck\2} txt
# Rejoin words hyphenated at the end of a line.
regsub -all {([a-zäöüß])-\n([a-zäöüß])} $txt {\1\2} txt
# Join the lines of a paragraph with a single blank. The character after
# the newline is only looked at, not consumed, so that a line consisting
# of a single character is joined to the following line as well.
regsub -all {(\S) *\n(?=\S)} $txt {\1 } txt
# A hyphen standing between punctuation or whitespace becomes an en dash.
regsub -all {([[:punct:][:space:]])-([[:space:][:punct:]])} $txt {\1–\2} txt
# Markup of the form [target "label"] becomes a hyperlink.
regsub -all {\[(\S+)\s+"([^"]*)"\]} $txt {<a href="\1">\2</a>} txt
# Wrap each block separated by an empty line in a paragraph element.
set txt "<p> [string map [list \n\n " </p>\n\n<p> "] [string trim $txt]] </p>"
# --- Document prologue and epilogue -----------------------------------------
# top receives everything up to and including the opening body tag, bottom
# the closing tags. append concatenates its arguments without separators,
# so every line break in the output is an explicit newline word and the
# braced words are emitted verbatim.
#
# Well-formedness fixes compared with the previous version:
#  - the title (derived from the input file name) is escaped,
#  - the stringmap meta element is closed and the angle brackets inside its
#    attribute value are written as entities,
#  - the type attribute of the style element is quoted.
# The HTML comment between head and body is a checklist emitted into the
# output; it is reproduced unchanged.
append top {<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de" lang="de">}\
\n <head> \n\
{<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />} \n\
<title> \n\
[string map {& &amp; < &lt; > &gt;} [file tail [file root $infile]]] \n\
</title> \n\
{<meta name="author" content="Unknown" />} \n\n\
{<meta name="DC.language" content="de" scheme="DCTERMS.RFC3066" />} \n\n\
{<meta name="splitlevel" content="1" />} \n\n\
{<meta name="wordmap" content="" />} \n\n\
{<meta name="stringmap" content=" ʼ ’\
₀ &lt;sub&gt;0&lt;/sub&gt; ₁ &lt;sub&gt;1&lt;/sub&gt; ₂ &lt;sub&gt;2&lt;/sub&gt; ₃ &lt;sub&gt;3&lt;/sub&gt; ₄ &lt;sub&gt;4&lt;/sub&gt;\
₅ &lt;sub&gt;5&lt;/sub&gt; ₆ &lt;sub&gt;6&lt;/sub&gt; ₇ &lt;sub&gt;7&lt;/sub&gt; ₈ &lt;sub&gt;8&lt;/sub&gt; ₉ &lt;sub&gt;9&lt;/sub&gt;\
⁰ &lt;sup&gt;0&lt;/sup&gt; ¹ &lt;sup&gt;1&lt;/sup&gt; ² &lt;sup&gt;2&lt;/sup&gt; ³ &lt;sup&gt;3&lt;/sup&gt; ⁴ &lt;sup&gt;4&lt;/sup&gt;\
⁵ &lt;sup&gt;5&lt;/sup&gt; ⁶ &lt;sup&gt;6&lt;/sup&gt; ⁷ &lt;sup&gt;7&lt;/sup&gt; ⁸ &lt;sup&gt;8&lt;/sup&gt; ⁹ &lt;sup&gt;9&lt;/sup&gt; " />} \n\n\
{<meta name="hyphen" content="" />} \n\n\
{<style type="text/css">
#cover-image img,
p+img {
display: block;
max-height: 100%;
max-width: 100%;
}
hr {
border-width: 0px;
height: 0px;
}
h1 {
line-height: 120%;
border-width: 1pt;
border-bottom-style: solid;
padding-bottom: 1em;
width: auto;
}
h1, h2, h3, h4, h5, h6 {
text-align: center;
page-break-before: always;
page-break-after: avoid;
font-size: 100%;
}
h1 + div + h2, h2 + div + h3,
h3 + div + h4, h4 + div + h5, h5 + div + h6,
h1 + h2, h2 + h3, h3 + h4, h4 + h5, h5 + h6 {
page-break-before: avoid;
}
p, li {
margin-top: 0.5em;
margin-bottom: 0px;
}
p {
-moz-hyphens: auto;
-o-hyphens: auto;
-webkit-hyphens: auto;
-ms-hyphens: auto;
hyphens: auto;
text-align: justify;
}
li {
text-align: justify;
}
body > div > div > p+p,
blockquote > p+p,
body > p+p,
li > p+p {
text-indent: 1.25em;
margin-top: 0px;
}
p+* {
margin-top: 0em;
}
tt, code, pre {
font-size: 0.9em;
}
pre {
line-height: 0.9em;
page-break-inside: avoid;
}
body {
padding: 0.5em;
}
sup {
font-size: 80%;
display: inline;
line-height: 50%;
position: relative;
top: -0.1em;
}
ol[type="a"] {
list-style-type: lower-alpha;
}
ol[type="1"] {
list-style-type: decimal;
}
ul, ol, dl, blockquote {
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-right: 0em;
}
dd {
page-break-before: avoid;
}
p+blockquote {
page-break-before: avoid;
}
div p {
page-break-inside: avoid;
}
div p.ebenda {
page-break-before: avoid;
}
</style>} \n\n\
</head> \n {<!--
- Ligaturen
- Staub
- Ziffern zwischen Buchstaben
- Öffnende,
- Schließende Gänsefüßchen
- Satz-Enden
- Seitenwechsel
- Kommentare innerhalb von Wörtern
- Rechtschreibkontrolle
- Korrekturlesen
- Abschnitte
- Hervorhebungen
-->} \n {<body>} \n
# Closing tags that follow the converted text.
append bottom \n </body> \n </html>
# Assemble the complete document and deliver it: print it on stdout when no
# output file was named on the command line, otherwise save it to that file.
set txt "${top}${txt}${bottom}"
if {$outfile eq ""} {
    puts $txt
} else {
    saveString $txt $outfile
}
19.10.2022
<< | Heimatseite | Verzeichnis | Stichworte | Autor | >>