forked from python-openxml/python-docx
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcxml.py
More file actions
264 lines (219 loc) · 7.63 KB
/
cxml.py
File metadata and controls
264 lines (219 loc) · 7.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# encoding: utf-8
"""
Parser for Compact XML Expression Language (CXEL) ('see-ex-ell'), a compact
XML specification language I made up that's useful for producing XML element
trees suitable for unit testing.
"""
from __future__ import print_function
from pyparsing import (
alphas, alphanums, Combine, dblQuotedString, delimitedList, Forward,
Group, Literal, Optional, removeQuotes, stringEnd, Suppress, Word
)
from docx.oxml import parse_xml
from docx.oxml.ns import nsmap
# ====================================================================
# api functions
# ====================================================================
def element(cxel_str):
    """
    Return the oxml element produced by parsing the XML that *cxel_str*
    expands to.

    The CXEL string is first converted to XML text with `xml()`; that text
    is then handed to `parse_xml()`.
    """
    return parse_xml(xml(cxel_str))
def xml(cxel_str):
    """
    Return the XML text generated from the CXEL expression *cxel_str*.

    The expression is parsed with the module-level `root_node` parser; the
    resulting root `Element` renders itself via its `xml` property.
    """
    parsed = root_node.parseString(cxel_str)
    return parsed.element.xml
# ====================================================================
# internals
# ====================================================================
def nsdecls(*nspfxs):
    """
    Return a string containing one ``xmlns:pfx="uri"`` declaration for each
    prefix in *nspfxs*, in the order given. Each declaration is preceded by
    a single space, so the result can be appended directly after a tag name.
    The empty string is returned when no prefixes are passed.

    The URI for each prefix is looked up in `nsmap`; a prefix that is not
    present there propagates the lookup error.
    """
    return ''.join(
        ' xmlns:%s="%s"' % (pfx, nsmap[pfx]) for pfx in nspfxs
    )
class Element(object):
    """
    Represents an XML element, having a namespace, tagname, attributes, and
    may contain either text or children (but not both) or may be empty.

    Attribute values and text are written into the generated XML verbatim;
    no XML escaping is applied to them.
    """

    def __init__(self, tagname, attrs, text):
        """
        *tagname* is the (possibly prefixed) tag name, e.g. ``'w:pPr'``.
        *attrs* is a sequence of ``(name, value)`` pairs. *text* is the
        element text, or a falsy value (e.g. ``''``) when there is none.
        """
        self._tagname = tagname
        self._attrs = attrs
        self._text = text
        self._children = []
        self._is_root = False
        # Overwritten by _xml() at the start of each rendering pass.
        # Initialized here so _start_tag and _end_tag can be read on an
        # element that has not been rendered yet, instead of raising
        # AttributeError.
        self._indent_str = ''

    def __repr__(self):
        """
        Provide a more meaningful repr value for an Element object, one that
        displays the tagname as a simple empty element, e.g. ``<w:pPr/>``.
        """
        return "<%s/>" % self._tagname

    def connect_children(self, child_node_list):
        """
        Make each of the elements appearing in *child_node_list* a child of
        this element. Each item must expose its `Element` as an ``element``
        attribute; children are appended in the order given.
        """
        for node in child_node_list:
            self._children.append(node.element)

    @classmethod
    def from_token(cls, token):
        """
        Return an ``Element`` object constructed from a parser element token.
        The token must provide ``tagname``, ``attr_list`` (an iterable of
        name/value pairs) and ``text``.
        """
        tagname = token.tagname
        attrs = [(name, value) for name, value in token.attr_list]
        text = token.text
        return cls(tagname, attrs, text)

    @property
    def is_root(self):
        """
        |True| if this element is the root of the tree and should include the
        namespace prefixes. |False| otherwise.
        """
        return self._is_root

    @is_root.setter
    def is_root(self, value):
        # Coerce so the stored flag is always a real bool.
        self._is_root = bool(value)

    @property
    def nspfx(self):
        """
        The namespace prefix of this element, the empty string (``''``) if
        the tag is in the default namespace (i.e. the tag name contains no
        colon).
        """
        tagname = self._tagname
        idx = tagname.find(':')
        if idx == -1:
            return ''
        return tagname[:idx]

    @property
    def nspfxs(self):
        """
        A sequence containing each of the namespace prefixes appearing in
        this tree. Each prefix appears once and only once, and in document
        order.
        """
        # A list (rather than a set) is used so first-seen order is kept.
        nspfxs = [self.nspfx]
        for child in self._children:
            for nspfx in child.nspfxs:
                if nspfx not in nspfxs:
                    nspfxs.append(nspfx)
        return nspfxs

    @property
    def xml(self):
        """
        The XML corresponding to the tree rooted at this element,
        pretty-printed using 2-spaces indentation at each level and with
        a trailing '\\n'.
        """
        return self._xml(indent=0)

    def _xml(self, indent):
        """
        Return a string containing the XML of this element and all its
        children with a starting indent of *indent* spaces. Each level of
        children is indented two spaces further than its parent.
        """
        # _start_tag and _end_tag read the indent from this attribute, so it
        # must be set before either of them is evaluated.
        self._indent_str = ' ' * indent
        parts = [self._start_tag]
        for child in self._children:
            parts.append(child._xml(indent + 2))
        parts.append(self._end_tag)
        return ''.join(parts)

    @property
    def _start_tag(self):
        """
        The text of the opening tag of this element, including attributes. If
        this is the root element, a namespace declaration for each of the
        namespace prefixes that occur in this tree is added in front of any
        attributes. If this element contains text, that text follows the
        start tag. If not, and this element has no children, an empty tag is
        returned. Otherwise, an opening tag is returned, followed by
        a newline. The tag is indented by this element's indent value in all
        cases.
        """
        _nsdecls = nsdecls(*self.nspfxs) if self.is_root else ''
        tag = '%s<%s%s' % (self._indent_str, self._tagname, _nsdecls)
        for name, value in self._attrs:
            tag += ' %s="%s"' % (name, value)
        if self._text:
            # Text takes precedence; the closing tag follows on the same
            # line (see _end_tag).
            tag += '>%s' % self._text
        elif self._children:
            tag += '>\n'
        else:
            tag += '/>\n'
        return tag

    @property
    def _end_tag(self):
        """
        The text of the closing tag of this element, if there is one. If the
        element contains text, no leading indentation is included. The empty
        string is returned for an element having neither text nor children,
        since its start tag is already self-closing.
        """
        if self._text:
            tag = '</%s>\n' % self._tagname
        elif self._children:
            tag = '%s</%s>\n' % (self._indent_str, self._tagname)
        else:
            tag = ''
        return tag
# ====================================================================
# parser
# ====================================================================
# parse actions ----------------------------------
def connect_node_children(s, loc, tokens):
    """
    Parse action for a non-root node: attach the node's parsed children to
    the node's element. *s* and *loc* are accepted to satisfy the
    three-argument parse-action signature and are not used. The node of
    interest is the first item of *tokens*.
    """
    parent = tokens[0]
    attach = parent.element.connect_children
    attach(parent.child_node_list)
def connect_root_node_children(root_node):
    """
    Parse action for the root node: attach the parsed children to the root
    element, then flag that element as the root of the tree.
    """
    root_element = root_node.element
    root_element.connect_children(root_node.child_node_list)
    root_element.is_root = True
def grammar():
    """
    Build and return the pyparsing expression that matches a complete CXEL
    string.

    An element is written as a prefixed tag name, optionally followed by a
    brace-enclosed list of ``name=value`` attributes and then optionally by
    double-quoted text, e.g. ``w:jc{val=right}``. Children follow a slash
    and are either a single node or a parenthesized list of nodes. The
    returned expression requires the whole input string to be consumed.
    """
    # terminals ----------------------------------
    colon = Literal(':')
    equal = Suppress('=')
    slash = Suppress('/')
    open_paren = Suppress('(')
    close_paren = Suppress(')')
    open_brace = Suppress('{')
    close_brace = Suppress('}')

    # np:tagName ---------------------------------
    nspfx = Word(alphas)
    local_name = Word(alphas)
    # Combine joins prefix, colon and local name into a single token
    tagname = Combine(nspfx + colon + local_name)

    # np:attr_name=attr_val ----------------------
    attr_name = Word(alphas + ':')
    attr_val = Word(alphanums + ' -.%')
    attr_def = Group(attr_name + equal + attr_val)
    attr_list = open_brace + delimitedList(attr_def) + close_brace

    # Attach the parse action to a copy: setParseAction() modifies the
    # expression it is called on, and dblQuotedString is a module-level
    # pyparsing object shared by every other user of it in the process.
    text = dblQuotedString.copy().setParseAction(removeQuotes)

    # w:jc{val=right} ----------------------------
    element = (
        tagname('tagname')
        + Group(Optional(attr_list))('attr_list')
        + Optional(text, default='')('text')
    ).setParseAction(Element.from_token)

    # child_node_list is referenced by node before it can be defined, so it
    # is declared as a Forward and filled in below.
    child_node_list = Forward()

    node = Group(
        element('element')
        + Group(Optional(slash + child_node_list))('child_node_list')
    ).setParseAction(connect_node_children)

    child_node_list << (
        open_paren + delimitedList(node) + close_paren
        | node
    )

    # Same shape as node, but not wrapped in a Group, anchored to the end of
    # the input, and marked as the tree root by its parse action.
    root_node = (
        element('element')
        + Group(Optional(slash + child_node_list))('child_node_list')
        + stringEnd
    ).setParseAction(connect_root_node_children)

    return root_node
# Module-level parser expression, built once when this module is imported;
# xml() calls its parseString() method.
root_node = grammar()