XML

Das Paket XML bietet einen elementaren XML-Parser. Es verarbeitet auch leere Tags, etwa <hr />, und Einschübe des Typs <![CDATA[...]]>. Per Voreinstellung ignoriert der Parser Leerzeichen; mit Schalter -space yes übernimmt er sie Eins zu Eins.

Das Resultat eines Parser-Laufs ist ein Dictionary mit den Schlüsseln type (element oder pcdata), name, attribute, content.

Beispielsitzung:

% set src {<p class="heiter"> Hi! </p>}
<p class="heiter"> Hi! </p>
% set tree [xml parse $src]
type element name p attribute {class heiter} content {{type pcdata content Hi!}}

Auf die Variable tree wird zugegriffen mit [dict ... $tree ...].

Die Prozedur unparse rekonstruiert den Quelltext:

% xml unparse $tree
<p class="heiter">Hi!</p>

Die Option -pp bewirkt übersichtliche Einrückung:

% xml unparse $tree -pp
<p class="heiter">
  Hi!
</p>
%

Schreibzugriffe auf die Baumstruktur innerhalb content ist derzeit nicht geplant.

Den ersten Funktionstest erfuhr der Parser bei meinem Feed-Reader.


#
# XMl -- minimalistic but working XML parser
# usage: package require XML
# 
# xml parse $src ?-space yes|no? 
# xml findElementsByName $tree $name
# xml findElementById $tree $id
# xml getElement $tree ?n1 ?n2 ...??
# xml getText $tree ?n1 ?n2 ...??
# xml unparse $tree ?-pp?
#
# default: xml parse $src -space no
#

package require Tcl 8.6.1
package provide xml 0.1

namespace eval xml namespace export\
  parse\
  unparse\
  getElement\
  getText\
  findElementById\
  findElementsByName

namespace eval xml namespace import ::tcl::mathop::+

proc ::xml::parse {src args} {
  set option [dict merge {
    -space no
  } {*}[lmap {a b} $args {
    if {$a ni { -space }} then {
      return -code error [list unknown option: $a -- {*}[info level 0]]
    }
    list $a $b
  }]]
  # remove comments
  set src [regsub -all <!--{1,1}?.*--> $src ""]
  # encode <![CDATA[http://www.taz.de//!p4608/]]>
  set cdataMap [concat {*}[lsort -unique [lmap {a b}\
    [regexp -inline -all {<!\[CDATA\[(.*?)\]\]>} $src] {
    list $a [string map {< &lt; & &amp; > &gt;} $b]
  }]]]
  set src [string map $cdataMap $src]
  # list of tag index pairs
  set pairs\
    [regexp -inline -indices -all {</?[[:alnum:]:-]+[^>]*/?>} $src]
  # list of tag strings
  set tagList [lmap pair $pairs {string range $src {*}$pair}]
  # indices of pcdata
  set strIdx [lrange [concat {*}$pairs] 1 end-1]
  # list of pcdata strings
  set strList {}
  foreach {i j} $strIdx {
    lappend strList [string range $src $i+1 $j-1]
  }
  # tokens alternating: tag, pcdata, tag, ...
  set tokens {}
  foreach tag $tagList str $strList {
    lappend tokens $tag
    if {[dict get $option -space]} then {
      if {$str ne ""} then {
        lappend tokens $str
      }
    } else {
      if {![string is space $str]} then {
        lappend tokens [string trim $str]
      }
    }
  }
  # process list
  tokensVarToTree tokens
}

proc ::xml::tokensVarToTree _tokens {
  lassign [info level 0] recurse
  upvar $_tokens tokens
  set openPattern {<([[:alnum:]:-]+)[^>]*>}
  set emptyPattern {<([[:alnum:]:-]+)[^>]*/>}
  set closePattern {</([[:alnum:]:-]+)>}
  set dataPattern {^[^<]}
  # destructive
  set tokens [lassign $tokens token]
  if {[regexp $dataPattern $token]} then {
    # PCDATA
    dict set result type pcdata
    dict set result content $token
    return $result
  } elseif {[regexp $openPattern $token - name]} then {
    # TAG opening or empty
    dict set result type element
    dict set result name $name
    dict set result attribute {}
    dict set result content ""
    # attributes
    foreach {- key val}\
      [regexp -inline -all {([[:alnum:]]+)="([^"]*?)"} $token] {
      dict set result attribute $key $val
    }
    foreach {- key val}\
      [regexp -inline -all {([[:alnum:]]+)='([^']*?)'} $token] {
      dict set result attribute $key $val
    }
  }
  if {[regexp $emptyPattern $token]} then {
    # TAG empty: done
    return $result
  }
  while {![regexp $closePattern [lindex $tokens 0]] &&
         [llength $tokens] > 0} {
    # TAG non-empty: fill contents
    dict lappend result content [$recurse tokens]
  }
  # remove closing TAG
  if {[regexp $closePattern [lindex $tokens 0]]} then {
    set tokens [lrange $tokens 1 end]
  }
  set result
}

proc ::xml::getElement {tree args} {
  lassign [info level 0] recurse
  if {$args eq "" || 
      [dict get $tree type] eq "pcdata"} then {
    set tree
  } else {
    set args [lassign $args index]
    if {$index < [llength [dict get $tree content]]} then {
      $recurse [lindex [dict get $tree content] $index] {*}$args
    }
  }
}

proc ::xml::findElementById {tree id args} {
  lassign [info level 0] recurse
  if {[dict get $tree type] eq "element"} then {
    set i 0  
    foreach child [dict get $tree content] {
      if {[dict get $child type] eq "element" &&
          [dict exists $child attribute id] &&
          [dict get $child attribute id] eq $id} then {
        return [concat $args $i]
      }
      set path [$recurse $child $id {*}$args $i]
      if {$path ne ""} then {
        return $path
      }
      incr i
    }
  }
}

proc ::xml::findElementsByNameRoutine {tree name args} {
  lassign [info level 0] recurse
  if {[dict get $tree type] eq "element"} then {
    set i 0  
    foreach child [dict get $tree content] {
      if {[dict get $child type] eq "element" &&
          [dict get $child name] eq $name} then {
        yield [concat $args $i]
      }
      $recurse $child $name {*}$args $i
      incr i
    }
  }
}

proc ::xml::findElementsByName {tree name} {
  lassign [info level 0] recurse
  set i 1
  while {[info commands c$i] ne ""} {
    incr i
  }
  set coroutine c$i
  coroutine $coroutine apply [list  {tree name} {
      yield [info coroutine]
      findElementsByNameRoutine $tree $name
    } [namespace current]] $tree $name
  set result {}
  while true {
    set path [$coroutine]
    if {$path ne ""} then {
      lappend result $path
    } else {
      return $result
    }
  }
}

proc ::xml::unparse {tree {indent 0} args} {
  lassign [info level 0] recurse
  if {![string is digit $indent]} then {
    lappend args $indent
    set indent 0
  }
  set result ""
  if {"-pp" in $args} then {
    if {$indent > 0} then {
      append result \n
    }
    append result [string repeat "  " $indent]
  }
  if {[dict get $tree type] eq "pcdata"} then {
    if {"-pp" in $args} then {
      append result [string trim [dict get $tree content]]
    } else {
      append result [dict get $tree content]
    }
  } else {
    append result <[dict get $tree name]
    foreach {key val} [dict get $tree attribute] {
      append result " $key="
      if {[string first \u0022 $val] < 0} then {
        append result \" $val \"
      } else {
        append result ' $val '
      }
    }
    if {[llength [dict get $tree content]] == 0} then {
      append result " />"
    } else {
      append result >
      foreach child [dict get $tree content] {
        append result [$recurse $child [+ $indent 1] {*}$args]
      }
      if {"-pp" in $args} then {
        append result \n[string repeat "  " $indent]
      }
      append result </[dict get $tree name]>
    }
  }
  set result
}

proc ::xml::decode txt {
  lappend map "&lt;" < "&gt;" > "&amp;" & "&quot;" \"
  set matches [lsort -unique [regexp -inline -all {&#[0-9]+;} $txt]]
  foreach match $matches {
    regexp {([0-9]+)} $match - i
    lappend map $match [format %c [scan $i %d]]
  }
  string map $map $txt
}

proc ::xml::getText {tree args} {
  set child [getElement $tree {*}$args]
  if {[dict get $child type] eq "pcdata"} then {
    decode [dict get $child content]
  } elseif {[llength [dict get $child content]] > 0} then {
    getText $child 0
  }
}

namespace eval xml namespace ensemble create

(Ursprung – 21.04.2019)

<< Home | Sitemap | A-Z | Impressum | Suche >>