1 # xml-parse.tcl --
2 #
3 # XML Parser
4
5
6 #--------------------------------------------------
7 #
8 #Class XMLParser - need this comment for import
9 #
10 #--------------------------------------------------
11
12
13 # $Id: xml-parse.tcl,v 1.4 2004/05/27 01:43:39 aswan Exp $
14 #
15 # Copyright (c) 1997 Australian National University (ANU).
16 #
17 # ANU makes this software and all associated data and documentation
18 # ('Software') available free of charge for non-commercial purposes only. You
19 # may make copies of the Software but you must include all of this notice on
20 # any copy.
21 #
22 # The Software was developed for research purposes and ANU does not warrant
23 # that it is error free or fit for any purpose. ANU disclaims any
24 # liability for all claims, expenses, losses, damages and costs any user may
25 # incur as a result of using, copying or modifying the Software.
26
27 ##########################################
28 #
29 # XML Parser
30 #
31 # This package provides procedures for parsing
32 # XML DTDs and document instances.
33 #
34 # It uses the XML namespace.
35 #
36 ##########################################
37
38 package provide XML 1.0
39
40 namespace eval XML {
41 namespace export parse parse:idx pretty_print pretty_print:idx
42 namespace export parse:DTD
43
44 # XML:parse:idx --
45 #
46 # Arguments:
47 # xml Document instance
48 # args option/value pairs
49 #
50 # Recognised Options:
51 # -textcommand Default parse:text
52 # -elementcommand Default parse:elem
53 # -picommand Default parse:pi
54 # -commentcommand Default None
55 # -dtd DTD to use for document
56 # -loaddtdcommand Script to load and parse DTD
57 # -errorcommand Script to evaluate if error found
58 # -warningcommand Script to evaluate for a reportable warning
59 # -unusedvariable global variable for storing unused text
60 # -statevariable global parser variable
61 # -start Default true
62 # -end Default true
63 #
64 # Return value:
65 # None.
66
67 proc parse:idx {xml args} {
68 array set opts {
69 -start 1 -end 1
70 -removewhitespace 0
71 -textcommand parse:text
72 -elementcommand parse:elem
73 -picommand parse:pi
74 -loaddtdcommand noop
75 -errorcommand noop
76 -warningcommand noop
77 }
78 array set opts [list \
79 -statevariable [namespace current]::parse_state \
80 ]
81 array set opts $args ;# very rudimentary option processing
82 regsub {^(1|yes|true|on)$} $opts(-start) 1 opts(-start)
83 regsub {^(0|no|false|off)$} $opts(-start) 0 opts(-start)
84 regsub {^(1|yes|true|on)$} $opts(-end) 1 opts(-end)
85 regsub {^(0|no|false|off)$} $opts(-end) 0 opts(-end)
86
87 # First, transform the XML into a (flat) Tcl list, ala Uhler
88
89 # Protect Tcl special characters
90 regsub -all {([{}\\])} $xml {\\\1} xml
91
92 # Do the translation
93 set w " \t\r\n" ;# white space
94 set n "\r\n" ;# newline
95 set no_ws [cl ^$w]
96 set nmtoken [cl -a-zA-Z0-9._]+ ;# NMTOKENs
97 set name [cl a-zA-Z_][cl -a-zA-Z0-9._]*
98
99 # steve: check whether XML swallows newlines before and/or after a tag
100 # this version swallows
101 # Answer: don't swallow here, but collapse ws for element-only content
102 set exp <(/?)([cl ^$w>]+)[cl $w]*([cl ^>]*)(/?)>
103 set sub "\}\n{\\2} {\\1} {\\4} {\\3} \{"
104 if {[info exists opts(-unusedvariable)]} {
105 upvar #0 $opts(-unusedvariable) unused
106 if {[info exists unused]} {
107 # Sanity check
108 if {$opts(-start)} {
109 error {specified start of XML, but found unused XML text}
110 }
111 regsub -all $exp "$unused$xml" $sub xml
112 unset unused
113 } else {
114 regsub -all $exp $xml $sub xml
115 }
116 set xml "{} {} {} {} \{$xml\}"
117 if {[regexp {^([^<]*)(<[^>]*$)} [lindex $xml end] x text unused]} {
118 set xml [lreplace $xml end end $text]
119 }
120 } else {
121 regsub -all $exp $xml $sub xml
122 set xml "{} {} {} {} \{$xml\}"
123 }
124
125 dbgputs parse {Flat list:}
126 foreach {tag close empty param text} $xml {
127 dbgputs parse [list $tag $close $empty $param $text]
128 }
129 dbgputs parse {}
130
131 # Now process this flat representation into a heirarchical list.
132 # Start with no context.
133
134 upvar #0 $opts(-statevariable) state
135 if {[info exists opts(-dtd)]} {
136 array set contentmodel [lindex $opts(-dtd) 0]
137 array set attributes [lindex $opts(-dtd) 1]
138 array set entities [lindex $opts(-dtd) 2]
139 } else {
140 # Parse the document without a DTD. All content models assume
141 # ANY, all attributes are allowed and the basic entities are
142 # available.
143 #array set contentmodel {}
144 #array set attributes {}
145 array set entities {amp & lt < gt > quot \" apos '}
146 }
147
148 if {$opts(-start)} {
149 # The document prologue gives us the required DTD and
150 # starting context, from which we can determine the start
151 # content model.
152 catch {unset state}
153 array set state [list mode normal RMD ALL cmodel {} context {} stack $opts(-statevariable)(container0) level 0 cid 0 preserve 0 pre_stack {}]
154 upvar #0 [lindex $state(stack) end] currContainer
155 set currContainer {}
156 } else {
157 upvar #0 [lindex $state(stack) end] currContainer
158 }
159
160 foreach {tag close empty param text} $xml {
161 # If the current mode is cdata or comment then we must undo what the
162 # regsub above has done to reconstitute the data
163 switch $state(mode) {
164 comment {
165 if {![string compare {} $param] && [regexp ([cl ^-]*)--[cl $w]*\$ $tag x comm1]} {
166 # Found the end of the comment
167 dbgputs parse {end of comment (in tag)}
168 catch {append currContainer "\n$opts(-commentcommand) [list "$state(comment_text)<$close$comm1"] {} {}"}
169 set tag {}
170 unset state(comment_text)
171 set state(mode) normal
172 } elseif {[regexp ([cl ^-]*)--[cl $w]*\$ $param x comm1]} {
173 # Found the end of the comment
174 dbgputs parse {end of comment (in attributes)}
175 catch {append currContainer "\n$opts(-commentcommand) [list "$state(comment_text)<$close$tag $comm1"] {} {}"}
176 set tag {}
177 set param {}
178 unset state(comment_text)
179 set state(mode) normal
180 } elseif {[regexp ([cl ^-]*)--[cl $w]*>(.*) $text x comm1 text]} {
181 # Found the end of the comment
182 dbgputs parse {end of comment (in text)}
183 catch {append currContainer "\n$opts(-commentcommand) [list "$state(comment_text)<$close$tag $param>$comm1"] {} {}"}
184 set tag {}
185 set param {}
186 unset state(comment_text)
187 set state(mode) normal
188 } else {
189 dbgputs parse {comment continues}
190 append state(comment_text) <$close$tag { } $param$empty>$text
191 continue
192 }
193 }
194 cdata {
195 if {![string compare {} $param] && [regexp ([cl ^\]]*)\]\][cl $w]*\$ $tag x cdata1]} {
196 # Found the end of the CDATA
197 dbgputs parse {end of CDATA (in tag)}
198 set text $state(cdata)<$close$cdata1$text
199 set tag {}
200 unset state(cdata)
201 set state(mode) normal
202 } elseif {[regexp ([cl ^\]]*)\]\][cl $w]*\$ $param x cdata1]} {
203 # Found the end of the CDATA
204 dbgputs parse {end of CDATA (in attributes)}
205 set text $state(cdata)<$close$tag\ $cdata1$text
206 set tag {}
207 set param {}
208 unset state(cdata)
209 set state(mode) normal
210 } elseif {[regexp ([cl ^\]]*)\]\][cl $w]*>(.*) $text x cdata1 text]} {
211 # Found the end of the CDATA
212 dbgputs parse {end of CDATA (in text)}
213 set text $state(cdata)<$close$tag\ $param$empty>$cdata1$text
214 set tag {}
215 set param {}
216 unset state(cdata)
217 set state(mode) normal
218 } else {
219 dbgputs parse {CDATA continues}
220 append state(cdata) <$close$tag { } $param$empty>$text
221 continue
222 }
223 }
224 markupdecl {
225 if {[regexp [cl $w]*([cl ^$w])[cl $w](.*) $param x id value]} {
226 catch {parse:dtd:[string toupper $tag] $id $value} err
227 continue
228 } else {
229 eval $opts(-errorcommand) [list "bad syntax in internal DTD subset declaration \"<$tag $param>\""]
230 }
231 if {[regexp {]>} $text]} {
232 # Ignore other text, since we're in a PI
233 set tag {}
234 set param {}
235 set text {}
236 set state(mode) normal
237 }
238 }
239 }
240 # default: normal mode
241
242 # Fold tag to uppercase and process attributes
243 set ftag [string toupper $tag]
244 catch {unset params}
245 array set params [parse:attrs $param]
246 # A validator would now check the attributes against the attribute list
247 # We won't bother, but (later) we will search the list for #FIXED attributes
248
249 dbgputs parse [list processing $close$ftag$empty - context [lindex $state(context) end] container [lindex $state(stack) end] cmodel [lindex $state(cmodel) end] level $state(level) containerid $state(cid) pre $state(preserve)]
250
251 switch -glob -- [regexp {^\?|!.*} $ftag],$close,$empty {
252 0,, {
253 # Start tag for a non-empty (container) element.
254
255 parse:ifValid opts $ftag [list parse:element:open opts $ftag params]
256 }
257 0,/, {
258 # End tag for an element.
259
260 # This tag (the end tag) should match the current context (which was the open tag).
261 if {[string compare $ftag [lindex $state(context) end]]} {
262 # Go looking for the matching start tag
263 dbgputs XML_parse [list end tag $ftag doesn't match open container [lindex $state(context) end]]
264 # First check whether the start tag is there at all
265 if {[lsearch $state(context) $ftag] >= 0} {
266 dbgputs XML_parse [list open tag found - closing intervening containers]
267 while {[string compare $ftag [lindex $state(context) end]]} {
268 dbgputs XML_parse [list closing intervening container [lindex $state(context) end]]
269 parse:element:close opts
270 }
271 # Now finally close the container
272 parse:element:close opts
273 eval $opts(-errorcommand) unbalanced [list "unbalanced open tags for close tag \"$ftag\""]
274 } else {
275 # No start tag found - ignore this tag
276 dbgputs XML_parse [list ignoring close tag $ftag - start tag not found]
277 eval $opts(-errorcommand) unmatchedclose [list "ignoring close tag for element \"$ftag\": no matching start tag"]
278 }
279
280 } else {
281 dbgputs XML_parse [list close container $ftag]
282 parse:element:close opts
283 }
284 }
285 0,,/ {
286 # Empty element
287
288 parse:ifValid opts $ftag [list append currContainer "\n$opts(-elemcommand) $ftag [list [array get params]] {}"]
289 }
290 1,, {
291 # Processing instructions or XML declaration
292 switch -glob -- $ftag {
293 \\?XML {
294 # $params(RMD) indicates whether DTD is required or not
295 catch {
296 if {![regexp {(NONE|INTERNAL|ALL)} [string toupper $params(RMD)] x state(RMD)]} {
297 eval $opts(-errorcommand) invalidvalue [list "invalid value \"$params(RMD)\" for Required Markup Declaration"]
298 }
299 }
300 append currContainer "\n$opts(-picommand) $ftag [list [array get params]] {}"
301 }
302 \\?* {
303 append currContainer "\n$opts(-picommand) $ftag [list [array get params]] {}"
304 }
305 !DOCTYPE {
306 # Parse the params supplied. Looking for Name, ExternalID and MarkupDecl
307 regexp ^[cl $w]*($name)(.*) $param x state(doc_name) param
308 set state(doc_name) [string toupper $state(doc_name)]
309 set externalID {}
310 if {[regexp -nocase ^[cl $w]*(SYSTEM|PUBLIC)(.*) $param x id param]} {
311 switch [string toupper $id] {
312 SYSTEM {
313 if {[regexp ^[cl $w]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $w]+'([cl ^']*)'(.*) $param x systemlit param]} {
314 set externalID [list SYSTEM $systemlit]
315 } else {
316 eval $opts(-errorcommand) syntax [list "SYSTEM identifier not followed by literal"]
317 }
318 }
319 PUBLIC {
320 if {[regexp ^[cl $w]+"([cl ^"]*)"(.*) $param x pubidlit param] || [regexp ^[cl $w]+'([cl ^']*)'(.*) $param x pubidlit param]} {
321 if {[regexp ^[cl $w]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $w]+'([cl ^']*)'(.*) $param x systemlit param]} {
322 set externalID [list PUBLIC $pubidlit $systemlit]
323 } else {
324 eval $opts(-errorcommand) syntax [list "PUBLIC identifier not followed by system literal"]
325 }
326 } else {
327 eval $opts(-errorcommand) syntax [list "PUBLIC identifier not followed by literal"]
328 }
329 }
330 }
331 if {[regexp -nocase ^[cl $w]+NDATA[cl $w]+($name)(.*) $param x notation param]} {
332 lappend externalID $notation
333 }
334 }
335
336 append currContainer "\n$opts(-picommand) $ftag [list $externalID] {}"
337
338 catch {
339 # If a DTD is supplied on the command line, it overrides any DTD specified by the PI
340 if {![string compare {} $opts(-dtd)] && ![string compare ALL $state(RMD)]} {
341 eval $opts(-loaddtdcommand) $externalID
342 }
343 }
344
345 # Now parse markupdecl
346 if {[regexp [format {^[%s]*\[[^<]*<([^%s]+)[%s]*([^%s]*)[%s]*(.*)} $w $w $w $w $w] $param x decl id value]} {
347 set state(mode) markupdecl
348 catch {parse:dtd:[string toupper $decl] $id $value} err
349 }
350
351 if {[info exists contentmodel]} {
352 set state(cmodel) [list [list $state(doc_name) {}]]
353 dbgputs XML_parse [list !doctype set cmodel to $state(cmodel)]
354 }
355 }
356 !--* {
357 # Start of a comment
358 # See if it ends in the same tag, otherwise change the
359 # parsing mode
360 regexp {!--(.*)} $tag x comm1
361 if {[regexp ([cl ^-]*)--[cl $w]*\$ $param x comm2]} {
362 dbgputs parse {processed comment (end in attributes)}
363 catch {append currContainer "\n$opts(-commentcommand) [list $comm1$comm2] {} {}"}
364 } elseif {[regexp ([cl ^-]*)--[cl $w]*\$ $text x comm2]} {
365 dbgputs parse {processed comment (end in text)}
366 catch {append currContainer "\n$opts(-commentcommand) [list "$comm1 $param>$comm2"] {} {}"}
367 } else {
368 dbgputs parse {start of comment}
369 set state(mode) comment
370 set state(comment_text) "$comm1 $param>$text"
371 continue
372 }
373 }
374 ![CDATA[* {
375 dbgputs parse {start CDATA section}
376 regexp {!\[CDATA\[(.*)} $tag x cdata1
377 if {[regexp {([^\]]*)]]$} $param x cdata2]} {
378 dbgputs parse {processed CDATA (end in attribute)}
379 set text "$cdata1 $cdata2$text"
380 } elseif {[regexp {([^\]]*)]]>(.*)} $text x cdata2 text]} {
381 dbgputs parse {processed CDATA (end in text)}
382 set text "$cdata1 $param>$cdata2$text"
383 } else {
384 dbgputs parse {start CDATA}
385 set state(cdata) "$cdata1 $param$text"
386 set state(mode) cdata
387 continue
388 }
389 }
390
391
392 !ELEMENT {
393 # Internal DTD declaration
394 }
395 !ATTLIST {
396 }
397 !ENTITY {
398 }
399 !NOTATION {
400 }
401
402
403 !* {
404 append currContainer "\n$opts(-picommand) $ftag [list $param] {}"
405 }
406 default {
407 eval $opts(-errorcommand) unknown [list "unknown processing instruction \"<$ftag>\""]
408 }
409 }
410 }
411 1,* -
412 0,/,/ {
413 # Syntax error
414 eval $opts(-errorcommand) syntax [list "syntax error: tag \"</$ftag/>\""]
415 }
416 }
417
418 # Add the text to the current container
419 # Must check that #PCDATA is in the content model
420
421 # XML states that white space is removed for element-only content,
422 # but is forwarded to the application in mixed content.
423 # The attribute XML-SPACE="PRESERVE" can be used to force white space to
424 # be kept. However, applications may want white space collapsed wherever
425 # possible. -removewhitespace may be used to indicate this behaviour.
426
427 set text [expr {($state(preserve) || ![info exists contentmodel]) || !$opts(-removewhitespace) ? [entity $text] : [zap_white [entity $text]]}]
428 switch -glob [info exists contentmodel([lindex $state(context) end])],[string compare 0 [regexp $no_ws $text]] {
429 1,-1 {
430 dbgputs XML_parse [list processing #PCDATA - current content model [lindex $state(cmodel) end]]
431 # Is #PCDATA valid for the current content model?
432 parse:ifValid opts #PCDATA [list append currContainer "\n$opts(-textcommand) [list $text] {} {}"]
433 }
434 0,-1 {
435 # No content model for this element, assume ANY
436 dbgputs XML_parse [list no content model for current element, assuming ANY]
437 append currContainer "\n$opts(-textcommand) [list $text] {} {}"
438 }
439 }
440 }
441
442 # If this is the end of the document, close all open containers
443 if {$opts(-end)} {
444 while {$state(level) > 0} {
445 dbgputs XML_parse [list at end - popping stack]
446 parse:element:close opts
447 }
448 }
449
450 return {}
451 }
452
453 # Support routines
454
455 proc cl x {return "\[$x\]"}
456 set w " \t\r\n" ;# white space
457
458 proc parse:ifValid {optvar content validcmd} {
459 upvar $optvar opts
460 upvar #0 $opts(-statevariable) state
461 upvar contentmodel cm
462
463 # Is this element valid for the current content model?
464 if {![info exists cm] && [string compare {} $content]} {
465 # There is no content model.
466 uplevel 1 $validcmd
467 } elseif {![string compare * [lindex [lindex $state(cmodel) end] 0]] || [lsearch [lindex [lindex $state(cmodel) end] 0] $content] >= 0} {
468 dbgputs XML_parse [list content $content is OK]
469 # Modify the content model according to the current rep
470 switch [lindex [lindex $state(cmodel) end] 1] {
471 {} -
472 ? {
473 # remove this model from the current content model
474 set state(cmodel) [lreplace $state(cmodel) end end [lreplace [lindex $state(cmodel) end] 0 1]]
475 }
476 * -
477 + {# no change necessary}
478 }
479
480 # Go ahead and run script
481 uplevel 1 $validcmd
482
483 } else {
484 dbgputs XML_parse [list element $content not in content model]
485 # There are four options:
486 # - as reps of content models allow, search ahead for a context where this element is valid, and modify content model appropriately
487 # - search ahead through the content model and imply intervening elements
488 # - close elements until an element lower in the stack allows this element
489 # - ignore this element (a "hard" error)
490 }
491 }
492
493 proc parse:element:open {optvar tag param} {
494 upvar $optvar opts
495 upvar #0 $opts(-statevariable) state
496 upvar contentmodel cm
497 upvar currContainer current
498 upvar $param params
499
500 dbgputs XML_parse [list parse:element:open current: level $state(level) cid $state(cid) cmodel $state(cmodel)]
501 incr state(level) ;# invariant: $state(level) == [llength $state(stack)]
502 set container $opts(-statevariable)(container[incr state(cid)])
503 append current "\n$opts(-elementcommand) $tag [list [array get params]] $container"
504 lappend state(stack) $container
505 uplevel "upvar #0 $container currContainer; set currContainer {}"
506 lappend state(context) $tag
507 catch {lappend state(cmodel) $cm($tag)}
508 dbgputs XML_parse [list content model is now $state(cmodel)]
509
510 # Need to keep a reference count for handling nested preformatted elements
511 if {[catch {
512 if {![string compare PRESERVE $params(XML-SPACE)]} {
513 dbgputs XML_parse {starting preformatted text}
514 incr state(preserve)
515 lappend state(pre_stack) 1
516 } else {
517 lappend state(pre_stack) 0
518 }
519 }]} {
520 lappend state(pre_stack) 0
521 }
522
523 return $container
524 }
525
526 proc parse:element:close optvar {
527 upvar $optvar opts
528 upvar #0 $opts(-statevariable) state
529
530 # Decrement preformatted reference count appropriately
531 incr state(preserve) -[lindex $state(pre_stack) end]
532 set state(pre_stack) [lreplace $state(pre_stack) end end] ;# pop stack
533
534 incr state(level) -1
535 set state(stack) [lreplace $state(stack) end end]
536 set state(context) [lreplace $state(context) end end]
537 catch {set state(cmodel) [lreplace $state(cmodel) end end]}
538 uplevel "upvar #0 [lindex $state(stack) end] currContainer"
539 dbgputs XML_parse [list parse:element:close $optvar - new: level $state(level) stack $state(stack) pre $state(preserve)]
540 }
541
542 # parse:attrs --
543 #
544 # Parse attributes. There are two forms for name-value pairs:
545 #
546 # name="value"
547 # name='value'
548 #
549 # Arguments:
550 # attrs attribute string given in a tag
551 #
552 # Return Value:
553 # A Tcl list representing the name-value pairs in the attribute string
554
555 proc parse:attrs {attrs} {
556 # First check whether there's any work to do
557 if {![string compare {} [string trim $attrs]]} {
558 return {}
559 }
560
561 # Protect Tcl special characters
562 regsub -all {([[\$\\])} $attrs {\\\1} attrs
563
564 set ws "\n\t "
565 regsub -all [format {([a-zA-Z0-9]+)[%s]*=[%s]*"([^"]*)"} $ws $ws] $attrs {[set parsed([string toupper {\1}]) {\2}] } attrs ;# "
566 regsub -all [format {([a-zA-Z0-9]+)[%s]*=[%s]*'([^']*)'} $ws $ws] $attrs {[set parsed([string toupper {\1}]) {\2}] } attrs
567 subst $attrs
568
569 return [array get parsed]
570 }
571
572 # parse --
573 #
574 # Parse XML text into a Tcl heirarchical list format.
575 # First parses into indexed list format, and then expands the
576 # indexed structures to arrive at final format.
577
578 proc parse {xml args} {
579 array set opts [list \
580 -statevariable [namespace current]::parse_state \
581 -textcommand parse:text \
582 -elemcommand parse:elem \
583 -picommand parse:pi \
584 -commentcommand parse:comment \
585 ]
586 array set opts $args
587
588 eval parse:idx [list $xml] $args
589
590 return [parse:expand $opts(-statevariable) $opts(-elemcommand) $opts(-textcommand) $opts(-picommand) $opts(-commentcommand)]
591 }
592
593 proc parse:expand {state elemcmd txtcmd picmd commentcmd} {
594 upvar #0 $state data
595 return [parse:expand_int ${state}(container0) $elemcmd $txtcmd $picmd $commentcmd]
596 }
597
598 proc parse:expand_int {container elemcmd txtcmd picmd commentcmd} {
599 upvar #0 $container data
600 if {![info exists data]} return;
601 set ret {}
602 foreach {type arg1 arg2 arg3} $data {
603 switch $type \
604 $elemcmd {
605 lappend ret $type $arg1 $arg2 [parse:expand_int $arg3 $elemcmd $txtcmd $picmd $commentcmd]
606 } \
607 $txtcmd - \
608 $picmd - \
609 $commentcmd {
610 lappend ret $type $arg1 $arg2 $arg3
611 } \
612 default {error [list $type doesn't match $elemcmd or $txtcmd]}
613 }
614 return $ret
615 }
616
617 # Print out a nice representation of a parsed XML structure
618
619 proc pretty_print {xml {indent {}} {elemcmd parse:elem} {txtcmd parse:text} {picmd parse:pi} {commentcmd parse:comment}} {
620 set ret {}
621 foreach {type arg1 arg2 arg3} $xml {
622 #puts "$indent"
623 switch $type \
624 $picmd - \
625 $commentcmd - \
626 $txtcmd {
627 if {[string compare {} $indent]} {
628 append ret [format %${indent}s { }]
629 }
630 append ret [list $arg1]\n
631 } \
632 $elemcmd {
633 if {[string compare {} $indent]} {
634 append ret [format %${indent}s { }]
635 } else {
636 append indent 0
637 }
638 append ret "[list $arg1] [list $arg2] \{\n"
639 append ret [pretty_print $arg3 [expr $indent + 4] $elemcmd $txtcmd $picmd $commentcmd]
640 if {$indent > 0} {
641 append ret [format %${indent}s { }]
642 }
643 append ret \}\n
644 }
645 }
646 return $ret
647 }
648
649 # Similar to above, but work from state variable with container pointers
650
651 proc pretty_print:idx {{container parse_state(container0)} {indent {}} {elemcmd parse:elem} {txtcmd parse:text} {picmd parse:pi} {commentcmd parse:comment}} {
652 upvar #0 $container xml
653
654 if {![info exists xml]} {
655 if {[string compare {} $indent]} {
656 append ret [format %${indent}s { }]
657 }
658 return {}
659 }
660
661 set ret {}
662 foreach {type arg1 arg2 arg3} $xml {
663 switch $type \
664 $picmd - \
665 $commentcmd - \
666 $txtcmd {
667 if {[string compare {} $indent]} {
668 append ret [format %${indent}s { }]
669 }
670 append ret [list $arg1]\n
671 } \
672 $elemcmd {
673 if {[string compare {} $indent]} {
674 append ret [format %${indent}s { }]
675 } else {
676 append indent 0
677 }
678 append ret "[list $arg1] [list $arg2] $arg3 \{\n"
679 append ret [pretty_print:idx $arg3 [expr $indent + 4] $elemcmd $txtcmd $picmd $commentcmd]
680 if {$indent > 0} {
681 append ret [format %${indent}s { }]
682 }
683 append ret \}\n
684 }
685 }
686 return $ret
687 }
688
689 # Debugging. See Utilities package for how to enable.
690
691 proc dbgputs args {}
692
693 # Do-nothing proc
694
695 proc noop args {}
696
697 ### Following procedures are based on html_library
698
699 # Convert multiple white space into a single space
700
701 proc zap_white data {
702 regsub -all "\[ \t\r\n\]+" $data { } data
703 return $data
704 }
705
706 # find XML entity references (syntax: &xxx;)
707
708 proc entity [list text [list entities [namespace current]::entity_predef]] {
709 if {![regexp & $text]} {return $text}
710 regsub -all {([][$\\])} $text {\\\1} new
711 regsub -all {&#(x?)([0-9]+);} \
712 $new {[format %c [scan \2 %[expr {\1 == {} ? d : x}] tmp; set tmp]]} new
713 regsub -all {&([a-zA-Z]+);} $new [format {[entity:deref %s \1]} $entities] new
714 return [subst $new]
715 }
716
717 # convert an XML escape sequence into character
718
719 proc entity:deref [list text [list entities [namespace current]::entity_predef] {unknown ?}] {
720 upvar #0 $entities map
721 set result $unknown
722 catch {set result $map($text)}
723 return $result
724 }
725
726 # table of predefined entities
727
728 array set entity_predef {
729 lt < gt > amp & quot \" apos '
730 }
731
732 ####################################
733 #
734 # DTD parser for XML
735 #
736 # A DTD is represented as a three element Tcl list.
737 # The first element contains the content models for elements,
738 # the second contains the attribute lists for elements and
739 # the last element contains the entities for the document.
740 #
741 ####################################
742
743 proc parse:DTD {dtd args} {
744 array set opts {
745 -errorcommand noop
746 }
747 array set opts $args
748
749 set w " \t\r\n" ;# white space
750 set exp <!([cl ^$w>]+)[cl $w]*([cl ^$w]+)[cl $w]*([cl ^>]*)>
751 set sub {{\1} {\2} {\3}}
752 regsub -all $exp $dtd $sub dtd
753
754 foreach {decl id value} $dtd {
755 catch {parse:dtd:[string toupper $decl] $id $value} err
756 }
757
758 return [list [array get contentmodel] [array get attributes] [array get entities]]
759 }
760
761 # Procedures for handling the various declarative elements in a DTD
762 # New elements may be added by creating a procedure of the form
763 # parse:dtd:_element_
764
765 # For each of these procedures, the various regular expressions they use
766 # are created outside of the proc to avoid overhead at runtime
767
768 proc parse:dtd:element {id value} {
769 dbgputs XML_DTD_parse [list parse:dtd:element $id $value]
770 upvar opts state
771 upvar contentmodel cm
772 upvar otherModels om
773
774 if {[info exists cm($id)] || [info exists im($id)] || [info exists om($id)]} {
775 eval $state(-errorcommand) element [list "element \"$id\" already declared"]
776 } else {
777 switch -- $value {
778 EMPTY {
779 set om($id) {}
780 }
781 ANY {
782 set cm($id) *
783 }
784 default {
785 # Translate the content model into Tcl list format
786 regsub -all {\(} $value " \{ " value
787 regsub -all {\)} $value " \} " value
788 regsub -all {([*,+|?])} $value { \1 } value
789 if {[catch {parse:dtd:element:cmodel $value 2} result]} {
790 eval $state(-errorcommand) element [list $result]
791 } else {
792 set cm($id) $result
793 }
794 }
795 }
796 }
797 }
798
799 # parse:dtd:element:cmodel --
800 #
801 # Parse an element content model.
802 # Content models are sequences of choices, possibly nested.
803 # If the model starts with a choice, then it is a single sequence
804 # at the top-level.
805 #
806 # This is going to need alot of work!
807 #
808 # Return Result:
809 # A Tcl list representing the content model:
810 # {elements1} rep {elements2} rep ...
811 # Eg: (head, (p|list|note)*, div*) becomes:
812 # head {} {p list note} * div *
813
814 proc parse:dtd:element:cmodel {value depth {nested 0}} {
815 upvar $depth entities ents
816 dbgputs XML_DTD_parse [list parse:dtd:element:cmodel $value $depth]
817 set model {}
818 set result {}
819 set rep 0
820
821 foreach part $value {
822 if {[llength $part] > 1} {
823 set result [parse:dtd:element:cmodel $part [expr $depth + 1] 1]
824 if {[llength $result] == 1} {
825 lappend model [lindex $result 0]
826 } else {
827 dbgputs XML_DTD_parse [list appending $result]
828 eval lappend model $result
829 }
830 set result {}
831 set rep 2 ;# 2 indicates that next rep should replace rep returned
832 } else {
833 switch -regexp -- $part {
834 %[^;]*; {
835 # Substitute parameter entity
836 regexp {%([^;]*);} $part x pentity
837 if {[catch {lappend result $ents($pentity)}]} {
838 error "unknown parameter entity \"$pentity\""
839 }
840 }
841 , {
842 if {[llength $result]} {
843 lappend model $result
844 }
845 set result {}
846 if {!$rep} {
847 lappend model {}
848 }
849 set rep 0
850 }
851 \\| {
852 }
853 \\+ -
854 \\* -
855 \\? {
856 if {[llength $result]} {
857 lappend model $result
858 set result {}
859 }
860 if {$rep == 2} {
861 set model [lreplace $model end end $part]
862 } else {
863 lappend model $part
864 }
865 set rep 1
866 }
867 default {
868 lappend result [lindex $part 0]
869 }
870 }
871 }
872 }
873
874 if {[llength $result]} {
875 lappend model $result
876 }
877 if {!$rep} {
878 lappend model {}
879 }
880
881 return $model
882 }
883
884 # Watch out for case-sensitivity
885
886 set attlist_exp [cl $w]*([cl ^$w]+)[cl $w]*([cl ^$w]+)[cl $w]*(#REQUIRED|#IMPLIED)
887 set attlist_enum_exp [cl $w]*([cl ^$w]+)[cl $w]*\\(([cl ^)]*)\\)[cl $w]*("([cl ^")])")? ;# "
888 set attlist_fixed_exp [cl $w]*([cl ^$w]+)[cl $w]*([cl ^$w]+)[cl $w]*(#FIXED)[cl $w]*([cl ^$w]+)
889
890 proc parse:dtd:attlist {id value} {
891 variable attlist_exp attlist_enum_exp attlist_fixed_exp
892 dbgputs XML_DTD_parse [list parse:dtd:attlist $id $value]
893 upvar opts state
894 upvar attributes am
895
896 if {[info exists am($id)]} {
897 eval $state(-errorcommand) attlist [list "attribute list for element \"$id\" already declared"]
898 } else {
899 # Parse the attribute list. If it were regular, could just use foreach,
900 # but some attributes may have values.
901 regsub -all {([][$\\])} $value {\\\1} value
902 regsub -all $attlist_exp $value {[parse:dtd:attlist:att {\1} {\2} {\3}]} value
903 regsub -all $attlist_enum_exp $value {[parse:dtd:attlist:att {\1} {\2} {\3}]} value
904 regsub -all $attlist_fixed_exp $value {[parse:dtd:attlist:att {\1} {\2} {\3} {\4}]} value
905 subst $value
906 set am($id) [array get attlist]
907 }
908 }
909
910 proc parse:dtd:attlist:att {name type default {value {}}} {
911 upvar attlist al
912 # This needs further work
913 set al($name) [list $default $value]
914 }
915
916 set param_entity_exp [cl $w]*([cl ^$w]+)[cl $w]*([cl ^"$w]*)[cl $w]*"([cl ^"]*)"
917
918 proc parse:dtd:entity {id value} {
919 variable param_entity_exp
920 dbgputs XML_DTD_parse [list parse:dtd:entity $id $value]
921 upvar opts state
922 upvar entities ents
923
924 if {[string compare % $id]} {
925 # Entity declaration
926 if {[info exists ents($id)]} {
927 eval $state(-errorcommand) entity [list "entity \"$id\" already declared"]
928 } else {
929 if {![regexp {"([^"]*)"} $value x entvalue] && ![regexp {'([^']*)'} $value x entvalue]} {
930 eval $state(-errorcommand) entityvalue [list "entity value \"$value\" not correctly specified"]
931 } ;# "
932 set ents($id) $entvalue
933 }
934 } else {
935 # Parameter entity declaration
936 switch -glob [regexp $param_entity_exp $value x name scheme data],[string compare {} $scheme] {
937 0,* {
938 eval $state(-errorcommand) entityvalue [list "parameter entity \"$value\" not correctly specified"]
939 }
940 *,0 {
941 # SYSTEM or PUBLIC declaration
942 }
943 default {
944 set ents($id) $data
945 }
946 }
947 }
948 }
949
950 set notation_exp [cl $w]*([cl ^$w]+)[cl $w]*(.*)
951
952 proc parse:dtd:notation {id value} {
953 variable notation_exp
954 upvar opts state
955
956 if {[regexp $notation_exp $value x scheme data] == 2} {
957 } else {
958 eval $state(-errorcommand) notationvalue [list "notation value \"$value\" incorrectly specified"]
959 }
960 }
961
962 # End XML namespace
963 }
964
965
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.