XML

Das Paket XML bietet einen elementaren XML-Parser. Es verarbeitet auch leere Tags, etwa <hr />, und Einschübe des Typs <![CDATA[]]>. Per Voreinstellung ignoriert der Parser Leerzeichen; mit Schalter -space yes übernimmt er sie Eins zu Eins.

Das Resultat eines Parser-Laufs ist ein Dictionary mit den Schlüsseln type (element, pcdata oder comment), name, attribute, content.

Beispielsitzung:

% set src {<p class="heiter"> Hi! </p>}
<p class="heiter"> Hi! </p>
% set tree [xml parse $src]
type element name p attribute {class heiter} content {{type pcdata content Hi!}}

Auf die Variable tree wird zugegriffen mit [dict ... $tree ...].

Die Prozedur unparse rekonstruiert den Quelltext:

% xml unparse $tree
<p class="heiter">Hi!</p>

Die Option -pp bewirkt übersichtliche Einrückung:

% xml unparse $tree -pp
<p class="heiter">
  Hi!
</p>
%

Schreibzugriff auf die Baumstruktur innerhalb content ist derzeit nicht geplant.

Den ersten Funktionstest erfuhr der Parser bei meinem Feed-Reader.


#
# XMl -- minimalistic but working XML parser
# usage: package require XML
# 
# xml parse $src ?-space yes|no? 
# xml findElementsByName $tree $name
# xml findElementById $tree $id
# xml getElement $tree ?n1 ?n2 ...??
# xml getText $tree ?n1 ?n2 ...??
# xml unparse $tree ?-pp?
#
# default: xml parse $src -space no
#

package require Tcl 8.6.1
package provide xml 0.3

namespace eval xml {
  namespace export\
    parse\
    unparse\
    getElement\
    getText\
    findElementById\
    findElementsByName
}

namespace eval xml namespace import ::tcl::mathop::+

proc ::xml::tokenize txt {
  set start 0
  set result {}
  while true {
    set openIdx [string first < $txt $start]
    if {$openIdx < 0} break
    if {[regexp {[[:alpha:]/]} [string index $txt $openIdx+1]]} then {
      set closeIdx [string first > $txt $openIdx]
      lappend result [list $openIdx $closeIdx]
    } elseif {[string range $txt $openIdx $openIdx+3] eq "<!--"} then {
      set closeIdx [string first --> $txt $openIdx]
      lappend result [list $openIdx [+ $closeIdx 2]]
    } else {
      incr start
      continue
    }
    set start $closeIdx
  }
  set result
}

proc ::xml::parse {src args} {
  set option [dict merge {
    -space no
  } {*}[lmap {a b} $args {
    if {$a ni { -space }} then {
      return -code error [list unknown option: $a -- {*}[info level 0]]
    }
    list $a $b
  }]]
  #
  # encode <![CDATA[http://www.taz.de//!p4608/]]>
  set cdataMap [concat {*}[lsort -unique [lmap {a b}\
    [regexp -inline -all {<!\[CDATA\[(.*?)\]\]>} $src] {
    list $a [string map {
      ]\] ]]
      ]\> ]&gt;
      < &lt; 
      & &amp; 
      > &gt; 
    } $b]
  }]]]
  set src [string map $cdataMap $src]
  set pairs [tokenize $src]
  #
  # set pairs [tokenize $src]
  #
  # list of tag strings
  set tagList [lmap pair $pairs {string range $src {*}$pair}]
  # indices of pcdata
  set strIdx [lrange [concat {*}$pairs] 1 end-1]
  # list of pcdata strings
  set strList {}
  foreach {i j} $strIdx {
    lappend strList [string range $src $i+1 $j-1]
  }
  # tokens alternating: tag, pcdata, tag, ...
  set tokens {}
  foreach tag $tagList str $strList {
    lappend tokens $tag
    if {[dict get $option -space]} then {
      if {$str ne ""} then {
        lappend tokens $str
      }
    } else {
      if {![string is space $str]} then {
        lappend tokens [string trim $str]
      }
    }
  }
  while {
    [string match <!--*--> [lindex $tokens 0]] ||
    [string match <\\?*\\?> [lindex $tokens 0]]
  } {
    set tokens [lrange $tokens 1 end]
  }
  # process list
  tokensVarToTree tokens
}


proc ::xml::tokensVarToTree _tokens {
  upvar $_tokens tokens
  set tokens [lassign $tokens token]
  if {[regexp {^<[[:alpha:]][^>]*>$} $token]} then {
    #
    # token is opening TAG 
    #
    regexp {[[:alnum:]]+} $token name
    set result [dict create type element name $name attribute "" content ""]
    set attList\
      [regexp -inline -all {([[:alnum:]]+)="([^"]*)"} $token]
    lappend attList\
      {*}[regexp -inline -all {([[:alnum:]]+)='([^']*)'} $token]
    foreach {match att val} $attList {
      dict set result attribute $att $val
    }
    dict set result content {}
    if {![regexp {/\s*>} $token]} then {
      #
      # non-empty TAG
      #
      while {
        [llength $tokens] > 0 &&
        ![string match </* [lindex $tokens 0]]
      } {
        dict lappend result content [tokensVarToTree tokens]
      }
      set tokens [lrange $tokens 1 end]
    }
    set result
  } elseif {[string match <!--*--> $token]} then {
    #
    # token is COMMENT
    #
    dict create type comment content [string range $token 4 end-3] 
  } else {
    #
    # token is PCDATA
    #
    dict create type pcdata content $token
  }
}

proc ::xml::getElement {tree args} {
  lassign [info level 0] recurse
  if {$args eq ""} then {
    set tree
  } else {
    switch -exact -- [dict get $tree type] {
      pcdata - comment {
        set tree
      }
      element {
        set args [lassign $args index]
        if {$index < [llength [dict get $tree content]]} then {
          $recurse [lindex [dict get $tree content] $index] {*}$args
        }      
      }
      default {
        return -code error [list unknown type [dict get $tree type]]
      }
    }
  }
}

proc ::xml::findElementById {tree id args} {
  lassign [info level 0] recurse
  if {[dict get $tree type] eq "element"} then {
    set i 0  
    foreach child [dict get $tree content] {
      if {[dict get $child type] eq "element" &&
          [dict exists $child attribute id] &&
          [dict get $child attribute id] eq $id} then {
        return [concat $args $i]
      }
      set path [$recurse $child $id {*}$args $i]
      if {$path ne ""} then {
        return $path
      }
      incr i
    }
  }
}

proc ::xml::findElementsByNameRoutine {tree names args} {
  if {[dict get $tree type] eq "element"} then {
    set i 0  
    foreach child [dict get $tree content] {
      if {[dict get $child type] eq "element"} then {
        set name [dict get $child name]
        if {[llength $names] == 1} then {
          lassign $names pattern
          if {[string match $pattern $name]} then {
            yield [concat $args $i]
          }
        } else {
          if {$name in $names} then {
            yield [concat $args $i]
          }
        }
      }
      findElementsByNameRoutine $child $names {*}$args $i
      incr i
    }
  }
}

proc ::xml::findElementsByName {tree args} {
  # xml findElementsByName $tree td ⇒ {1 0 1} {1 1 1}
  # xml findElementsByName $tree {t[dh]} ⇒ {1 0 0} {1 0 1} {1 1 0} {1 1 1}
  # xml findElementsByName $tree td th ⇒ {1 0 0} {1 0 1} {1 1 0} {1 1 1}
  lassign [info level 0] recurse
  set i 1
  while {[info commands c$i] ne ""} {
    incr i
  }
  set coroutine c$i
  coroutine $coroutine apply [list  {tree names} {
      yield [info coroutine]
      findElementsByNameRoutine $tree $names
    } [namespace current]] $tree $args
  set result {}
  while true {
    set path [$coroutine]
    if {$path ne ""} then {
      lappend result $path
    } else {
      return $result
    }
  }
}

proc ::xml::unparse {tree {indent 0} args} {
  lassign [info level 0] recurse
  if {![string is digit $indent]} then {
    lappend args $indent
    set indent 0
  }
  set result ""
  if {"-pp" in $args} then {
    if {$indent > 0} then {
      append result \n
    }
    append result [string repeat "  " $indent]
  }
  if {[dict get $tree type] eq "comment"} then {
    append result <!--[dict get $tree content]-->
  } elseif {[dict get $tree type] eq "pcdata"} then {
    if {"-pp" in $args} then {
      set trimmedTxt [string trim [dict get $tree content]]
      if {$trimmedTxt ne ""} then {
        append result $trimmedTxt
      }
    } else {
      append result [dict get $tree content]
    }
  } else {
    append result <[dict get $tree name]
    foreach {key val} [dict get $tree attribute] {
      append result " $key="
      if {[string first \u0022 $val] < 0} then {
        append result \" $val \"
      } else {
        append result ' $val '
      }
    }
    if {[llength [dict get $tree content]] == 0} then {
      append result " />"
    } else {
      append result >
      foreach child [dict get $tree content] {
        append result [$recurse $child [+ $indent 1] {*}$args]
      }
      if {"-pp" in $args} then {
        append result \n[string repeat "  " $indent]
      }
      append result </[dict get $tree name]>
    }
  }
  set result
}

proc ::xml::decode txt {
  lappend map "&lt;" < "&gt;" > "&amp;" & "&quot;" \"
  set matches [lsort -unique [regexp -inline -all {&#[0-9]+;} $txt]]
  foreach match $matches {
    regexp {([0-9]+)} $match - i
    lappend map $match [format %c [scan $i %d]]
  }
  string map $map $txt
}

proc ::xml::getText {tree args} {
  set child [getElement $tree {*}$args]
  if {[dict get $child type] eq "pcdata"} then {
    decode [dict get $child content]
  } elseif {[llength [dict get $child content]] > 0} then {
    getText $child 0
  }
}

namespace eval xml namespace ensemble create

19.10.2022