XML
Das Paket XML
bietet einen elementaren XML-Parser. Es verarbeitet auch leere Tags, etwa <hr />
, und Einschübe des Typs <![CDATA[
… ]]>
. Per Voreinstellung ignoriert der Parser Leerzeichen; mit Schalter -space yes
übernimmt er sie Eins zu Eins.
Das Resultat eines Parser-Laufs ist ein Dictionary mit den Schlüsseln type
(element
, pcdata
oder comment
), name
, attribute
, content
.
Beispielsitzung:
% set src {<p class="heiter"> Hi! </p>} <p class="heiter"> Hi! </p> % set tree [xml parse $src] type element name p attribute {class heiter} content {{type pcdata content Hi!}}
Auf die Variable tree
wird zugegriffen mit [dict
... $tree
...]
.
Die Prozedur unparse
rekonstruiert den Quelltext:
% xml unparse $tree <p class="heiter">Hi!</p>
Die Option -pp
bewirkt übersichtliche Einrückung:
% xml unparse $tree -pp <p class="heiter"> Hi! </p> %
Schreibzugriff auf die Baumstruktur innerhalb content
ist derzeit nicht geplant.
Den ersten Funktionstest erfuhr der Parser bei meinem Feed-Reader.
# # XMl -- minimalistic but working XML parser # usage: package require XML # # xml parse $src ?-space yes|no? # xml findElementsByName $tree $name # xml findElementById $tree $id # xml getElement $tree ?n1 ?n2 ...?? # xml getText $tree ?n1 ?n2 ...?? # xml unparse $tree ?-pp? # # default: xml parse $src -space no # package require Tcl 8.6.1 package provide xml 0.3 namespace eval xml { namespace export\ parse\ unparse\ getElement\ getText\ findElementById\ findElementsByName } namespace eval xml namespace import ::tcl::mathop::+ proc ::xml::tokenize txt { set start 0 set result {} while true { set openIdx [string first < $txt $start] if {$openIdx < 0} break if {[regexp {[[:alpha:]/]} [string index $txt $openIdx+1]]} then { set closeIdx [string first > $txt $openIdx] lappend result [list $openIdx $closeIdx] } elseif {[string range $txt $openIdx $openIdx+3] eq "<!--"} then { set closeIdx [string first --> $txt $openIdx] lappend result [list $openIdx [+ $closeIdx 2]] } else { incr start continue } set start $closeIdx } set result } proc ::xml::parse {src args} { set option [dict merge { -space no } {*}[lmap {a b} $args { if {$a ni { -space }} then { return -code error [list unknown option: $a -- {*}[info level 0]] } list $a $b }]] # # encode <![CDATA[http://www.taz.de//!p4608/]]> set cdataMap [concat {*}[lsort -unique [lmap {a b}\ [regexp -inline -all {<!\[CDATA\[(.*?)\]\]>} $src] { list $a [string map { ]\] ]] ]\> ]> < < & & > > } $b] }]]] set src [string map $cdataMap $src] set pairs [tokenize $src] # # set pairs [tokenize $src] # # list of tag strings set tagList [lmap pair $pairs {string range $src {*}$pair}] # indices of pcdata set strIdx [lrange [concat {*}$pairs] 1 end-1] # list of pcdata strings set strList {} foreach {i j} $strIdx { lappend strList [string range $src $i+1 $j-1] } # tokens alternating: tag, pcdata, tag, ... set tokens {} foreach tag $tagList str $strList { lappend tokens $tag if {[dict get $option -space]} then { if {$str ne ""} then { lappend tokens $str } } else { if {![string is space $str]} then { lappend tokens [string trim $str] } } } while { [string match <!--*--> [lindex $tokens 0]] || [string match <\\?*\\?> [lindex $tokens 0]] } { set tokens [lrange $tokens 1 end] } # process list tokensVarToTree tokens } proc ::xml::tokensVarToTree _tokens { upvar $_tokens tokens set tokens [lassign $tokens token] if {[regexp {^<[[:alpha:]][^>]*>$} $token]} then { # # token is opening TAG # regexp {[[:alnum:]]+} $token name set result [dict create type element name $name attribute "" content ""] set attList\ [regexp -inline -all {([[:alnum:]]+)="([^"]*)"} $token] lappend attList\ {*}[regexp -inline -all {([[:alnum:]]+)='([^']*)'} $token] foreach {match att val} $attList { dict set result attribute $att $val } dict set result content {} if {![regexp {/\s*>} $token]} then { # # non-empty TAG # while { [llength $tokens] > 0 && ![string match </* [lindex $tokens 0]] } { dict lappend result content [tokensVarToTree tokens] } set tokens [lrange $tokens 1 end] } set result } elseif {[string match <!--*--> $token]} then { # # token is COMMENT # dict create type comment content [string range $token 4 end-3] } else { # # token is PCDATA # dict create type pcdata content $token } } proc ::xml::getElement {tree args} { lassign [info level 0] recurse if {$args eq ""} then { set tree } else { switch -exact -- [dict get $tree type] { pcdata - comment { set tree } element { set args [lassign $args index] if {$index < [llength [dict get $tree content]]} then { $recurse [lindex [dict get $tree content] $index] {*}$args } } default { return -code error [list unknown type [dict get $tree type]] } } } } proc ::xml::findElementById {tree id args} { lassign [info level 0] recurse if {[dict get $tree type] eq "element"} then { set i 0 foreach child [dict get $tree content] { if {[dict get $child type] eq "element" && [dict exists $child attribute id] && [dict get $child attribute id] eq $id} then { return [concat $args $i] } set path [$recurse $child $id {*}$args $i] if {$path ne ""} then { return $path } incr i } } } proc ::xml::findElementsByNameRoutine {tree names args} { if {[dict get $tree type] eq "element"} then { set i 0 foreach child [dict get $tree content] { if {[dict get $child type] eq "element"} then { set name [dict get $child name] if {[llength $names] == 1} then { lassign $names pattern if {[string match $pattern $name]} then { yield [concat $args $i] } } else { if {$name in $names} then { yield [concat $args $i] } } } findElementsByNameRoutine $child $names {*}$args $i incr i } } } proc ::xml::findElementsByName {tree args} { # xml findElementsByName $tree td ⇒ {1 0 1} {1 1 1} # xml findElementsByName $tree {t[dh]} ⇒ {1 0 0} {1 0 1} {1 1 0} {1 1 1} # xml findElementsByName $tree td th ⇒ {1 0 0} {1 0 1} {1 1 0} {1 1 1} lassign [info level 0] recurse set i 1 while {[info commands c$i] ne ""} { incr i } set coroutine c$i coroutine $coroutine apply [list {tree names} { yield [info coroutine] findElementsByNameRoutine $tree $names } [namespace current]] $tree $args set result {} while true { set path [$coroutine] if {$path ne ""} then { lappend result $path } else { return $result } } } proc ::xml::unparse {tree {indent 0} args} { lassign [info level 0] recurse if {![string is digit $indent]} then { lappend args $indent set indent 0 } set result "" if {"-pp" in $args} then { if {$indent > 0} then { append result \n } append result [string repeat " " $indent] } if {[dict get $tree type] eq "comment"} then { append result <!--[dict get $tree content]--> } elseif {[dict get $tree type] eq "pcdata"} then { if {"-pp" in $args} then { set trimmedTxt [string trim [dict get $tree content]] if {$trimmedTxt ne ""} then { append result $trimmedTxt } } else { append result [dict get $tree content] } } else { append result <[dict get $tree name] foreach {key val} [dict get $tree attribute] { append result " $key=" if {[string first \u0022 $val] < 0} then { append result \" $val \" } else { append result ' $val ' } } if {[llength [dict get $tree content]] == 0} then { append result " />" } else { append result > foreach child [dict get $tree content] { append result [$recurse $child [+ $indent 1] {*}$args] } if {"-pp" in $args} then { append result \n[string repeat " " $indent] } append result </[dict get $tree name]> } } set result } proc ::xml::decode txt { lappend map "<" < ">" > "&" & """ \" set matches [lsort -unique [regexp -inline -all {&#[0-9]+;} $txt]] foreach match $matches { regexp {([0-9]+)} $match - i lappend map $match [format %c [scan $i %d]] } string map $map $txt } proc ::xml::getText {tree args} { set child [getElement $tree {*}$args] if {[dict get $child type] eq "pcdata"} then { decode [dict get $child content] } elseif {[llength [dict get $child content]] > 0} then { getText $child 0 } } namespace eval xml namespace ensemble create
19.10.2022
<< | Heimatseite | Verzeichnis | Stichworte | Autor | >>