XML
Das Paket XML bietet einen elementaren XML-Parser. Es verarbeitet auch leere Tags, etwa <hr />, und Einschübe des Typs <![CDATA[ … ]]>. Per Voreinstellung ignoriert der Parser Leerzeichen; mit Schalter -space yes übernimmt er sie Eins zu Eins.
Das Resultat eines Parser-Laufs ist ein Dictionary mit den Schlüsseln type (element, pcdata oder comment), name, attribute, content.
Beispielsitzung:
% set src {<p class="heiter"> Hi! </p>}
<p class="heiter"> Hi! </p>
% set tree [xml parse $src]
type element name p attribute {class heiter} content {{type pcdata content Hi!}}
Auf die Variable tree wird zugegriffen mit [dict ... $tree ...].
Die Prozedur unparse rekonstruiert den Quelltext:
% xml unparse $tree <p class="heiter">Hi!</p>
Die Option -pp bewirkt übersichtliche Einrückung:
% xml unparse $tree -pp <p class="heiter"> Hi! </p> %
Schreibzugriff auf die Baumstruktur innerhalb content ist derzeit nicht geplant.
Den ersten Funktionstest erfuhr der Parser bei meinem Feed-Reader.
#
# XMl -- minimalistic but working XML parser
# usage: package require XML
#
# xml parse $src ?-space yes|no?
# xml findElementsByName $tree $name
# xml findElementById $tree $id
# xml getElement $tree ?n1 ?n2 ...??
# xml getText $tree ?n1 ?n2 ...??
# xml unparse $tree ?-pp?
#
# default: xml parse $src -space no
#
package require Tcl 8.6.1
package provide xml 0.3
namespace eval xml {
namespace export\
parse\
unparse\
getElement\
getText\
findElementById\
findElementsByName
}
namespace eval xml namespace import ::tcl::mathop::+
proc ::xml::tokenize txt {
set start 0
set result {}
while true {
set openIdx [string first < $txt $start]
if {$openIdx < 0} break
if {[regexp {[[:alpha:]/]} [string index $txt $openIdx+1]]} then {
set closeIdx [string first > $txt $openIdx]
lappend result [list $openIdx $closeIdx]
} elseif {[string range $txt $openIdx $openIdx+3] eq "<!--"} then {
set closeIdx [string first --> $txt $openIdx]
lappend result [list $openIdx [+ $closeIdx 2]]
} else {
incr start
continue
}
set start $closeIdx
}
set result
}
proc ::xml::parse {src args} {
set option [dict merge {
-space no
} {*}[lmap {a b} $args {
if {$a ni { -space }} then {
return -code error [list unknown option: $a -- {*}[info level 0]]
}
list $a $b
}]]
#
# encode <![CDATA[http://www.taz.de//!p4608/]]>
set cdataMap [concat {*}[lsort -unique [lmap {a b}\
[regexp -inline -all {<!\[CDATA\[(.*?)\]\]>} $src] {
list $a [string map {
]\] ]]
]\> ]>
< <
& &
> >
} $b]
}]]]
set src [string map $cdataMap $src]
set pairs [tokenize $src]
#
# set pairs [tokenize $src]
#
# list of tag strings
set tagList [lmap pair $pairs {string range $src {*}$pair}]
# indices of pcdata
set strIdx [lrange [concat {*}$pairs] 1 end-1]
# list of pcdata strings
set strList {}
foreach {i j} $strIdx {
lappend strList [string range $src $i+1 $j-1]
}
# tokens alternating: tag, pcdata, tag, ...
set tokens {}
foreach tag $tagList str $strList {
lappend tokens $tag
if {[dict get $option -space]} then {
if {$str ne ""} then {
lappend tokens $str
}
} else {
if {![string is space $str]} then {
lappend tokens [string trim $str]
}
}
}
while {
[string match <!--*--> [lindex $tokens 0]] ||
[string match <\\?*\\?> [lindex $tokens 0]]
} {
set tokens [lrange $tokens 1 end]
}
# process list
tokensVarToTree tokens
}
proc ::xml::tokensVarToTree _tokens {
upvar $_tokens tokens
set tokens [lassign $tokens token]
if {[regexp {^<[[:alpha:]][^>]*>$} $token]} then {
#
# token is opening TAG
#
regexp {[[:alnum:]]+} $token name
set result [dict create type element name $name attribute "" content ""]
set attList\
[regexp -inline -all {([[:alnum:]]+)="([^"]*)"} $token]
lappend attList\
{*}[regexp -inline -all {([[:alnum:]]+)='([^']*)'} $token]
foreach {match att val} $attList {
dict set result attribute $att $val
}
dict set result content {}
if {![regexp {/\s*>} $token]} then {
#
# non-empty TAG
#
while {
[llength $tokens] > 0 &&
![string match </* [lindex $tokens 0]]
} {
dict lappend result content [tokensVarToTree tokens]
}
set tokens [lrange $tokens 1 end]
}
set result
} elseif {[string match <!--*--> $token]} then {
#
# token is COMMENT
#
dict create type comment content [string range $token 4 end-3]
} else {
#
# token is PCDATA
#
dict create type pcdata content $token
}
}
proc ::xml::getElement {tree args} {
lassign [info level 0] recurse
if {$args eq ""} then {
set tree
} else {
switch -exact -- [dict get $tree type] {
pcdata - comment {
set tree
}
element {
set args [lassign $args index]
if {$index < [llength [dict get $tree content]]} then {
$recurse [lindex [dict get $tree content] $index] {*}$args
}
}
default {
return -code error [list unknown type [dict get $tree type]]
}
}
}
}
proc ::xml::findElementById {tree id args} {
lassign [info level 0] recurse
if {[dict get $tree type] eq "element"} then {
set i 0
foreach child [dict get $tree content] {
if {[dict get $child type] eq "element" &&
[dict exists $child attribute id] &&
[dict get $child attribute id] eq $id} then {
return [concat $args $i]
}
set path [$recurse $child $id {*}$args $i]
if {$path ne ""} then {
return $path
}
incr i
}
}
}
proc ::xml::findElementsByNameRoutine {tree names args} {
if {[dict get $tree type] eq "element"} then {
set i 0
foreach child [dict get $tree content] {
if {[dict get $child type] eq "element"} then {
set name [dict get $child name]
if {[llength $names] == 1} then {
lassign $names pattern
if {[string match $pattern $name]} then {
yield [concat $args $i]
}
} else {
if {$name in $names} then {
yield [concat $args $i]
}
}
}
findElementsByNameRoutine $child $names {*}$args $i
incr i
}
}
}
proc ::xml::findElementsByName {tree args} {
# xml findElementsByName $tree td ⇒ {1 0 1} {1 1 1}
# xml findElementsByName $tree {t[dh]} ⇒ {1 0 0} {1 0 1} {1 1 0} {1 1 1}
# xml findElementsByName $tree td th ⇒ {1 0 0} {1 0 1} {1 1 0} {1 1 1}
lassign [info level 0] recurse
set i 1
while {[info commands c$i] ne ""} {
incr i
}
set coroutine c$i
coroutine $coroutine apply [list {tree names} {
yield [info coroutine]
findElementsByNameRoutine $tree $names
} [namespace current]] $tree $args
set result {}
while true {
set path [$coroutine]
if {$path ne ""} then {
lappend result $path
} else {
return $result
}
}
}
proc ::xml::unparse {tree {indent 0} args} {
lassign [info level 0] recurse
if {![string is digit $indent]} then {
lappend args $indent
set indent 0
}
set result ""
if {"-pp" in $args} then {
if {$indent > 0} then {
append result \n
}
append result [string repeat " " $indent]
}
if {[dict get $tree type] eq "comment"} then {
append result <!--[dict get $tree content]-->
} elseif {[dict get $tree type] eq "pcdata"} then {
if {"-pp" in $args} then {
set trimmedTxt [string trim [dict get $tree content]]
if {$trimmedTxt ne ""} then {
append result $trimmedTxt
}
} else {
append result [dict get $tree content]
}
} else {
append result <[dict get $tree name]
foreach {key val} [dict get $tree attribute] {
append result " $key="
if {[string first \u0022 $val] < 0} then {
append result \" $val \"
} else {
append result ' $val '
}
}
if {[llength [dict get $tree content]] == 0} then {
append result " />"
} else {
append result >
foreach child [dict get $tree content] {
append result [$recurse $child [+ $indent 1] {*}$args]
}
if {"-pp" in $args} then {
append result \n[string repeat " " $indent]
}
append result </[dict get $tree name]>
}
}
set result
}
proc ::xml::decode txt {
lappend map "<" < ">" > "&" & """ \"
set matches [lsort -unique [regexp -inline -all {&#[0-9]+;} $txt]]
foreach match $matches {
regexp {([0-9]+)} $match - i
lappend map $match [format %c [scan $i %d]]
}
string map $map $txt
}
proc ::xml::getText {tree args} {
set child [getElement $tree {*}$args]
if {[dict get $child type] eq "pcdata"} then {
decode [dict get $child content]
} elseif {[llength [dict get $child content]] > 0} then {
getText $child 0
}
}
namespace eval xml namespace ensemble create
19.10.2022
<< | Heimatseite | Verzeichnis | Stichworte | Autor | >>