Coverage for gws-app/gws/lib/xmlx/parser.py: 67%
107 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-17 01:37 +0200
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-17 01:37 +0200
1"""XML parser."""
3import re
4import xml.etree.ElementTree
6import gws
8from . import error, element, namespace
def from_path(
    path: str,
    case_insensitive: bool = False,
    compact_whitespace: bool = False,
    normalize_namespaces: bool = False,
    remove_namespaces: bool = False,
) -> gws.XmlElement:
    """Read an .xml file and parse its content into an ``XmlElement``.

    The file is read in binary mode; decoding is left to the parser.

    Args:
        path: Path to the .xml file.
        case_insensitive: If true, tag and attribute names are lowercased.
        compact_whitespace: If true, text is stripped and each run of
            whitespace (including newlines) is collapsed to a single space.
        normalize_namespaces: If true, a namespace URI for which
            ``namespace.find_by_uri`` finds an entry is replaced by the
            ``uri`` of that entry.
        remove_namespaces: If true, names are stored without their namespace.

    Returns:
        The root element of the parsed document.

    Raises:
        error.ParseError: If the content cannot be parsed.
    """

    with open(path, 'rb') as source:
        raw = source.read()

    return _parse(
        raw,
        case_insensitive=case_insensitive,
        compact_whitespace=compact_whitespace,
        normalize_namespaces=normalize_namespaces,
        remove_namespaces=remove_namespaces,
    )
def from_string(
    inp: str | bytes,
    case_insensitive: bool = False,
    compact_whitespace: bool = False,
    remove_namespaces: bool = False,
    normalize_namespaces: bool = False,
) -> gws.XmlElement:
    """Parse XML given as a string or bytes into an ``XmlElement``.

    Args:
        inp: The XML document as a string or bytes.
        case_insensitive: If true, tag and attribute names are lowercased.
        compact_whitespace: If true, text is stripped and each run of
            whitespace (including newlines) is collapsed to a single space.
        remove_namespaces: If true, names are stored without their namespace.
        normalize_namespaces: If true, a namespace URI for which
            ``namespace.find_by_uri`` finds an entry is replaced by the
            ``uri`` of that entry.

    Returns:
        The root element of the parsed document.

    Raises:
        error.ParseError: If the input cannot be parsed.
    """

    # Forward by keyword: the parameter order of this function differs
    # from the positional order expected by ``_parse``.
    root = _parse(
        inp,
        case_insensitive=case_insensitive,
        compact_whitespace=compact_whitespace,
        normalize_namespaces=normalize_namespaces,
        remove_namespaces=remove_namespaces,
    )
    return root
53##
def _parse(inp, case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces):
    """Decode the input and run it through an ElementTree parser with a custom target.

    Args:
        inp: The XML document as ``str`` or ``bytes``.
        case_insensitive: Passed to ``_ParserTarget``.
        compact_whitespace: Passed to ``_ParserTarget``.
        normalize_namespaces: Passed to ``_ParserTarget``.
        remove_namespaces: Passed to ``_ParserTarget``.

    Returns:
        Whatever the target's ``close`` returns, i.e. the root element.

    Raises:
        error.ParseError: If the ElementTree parser rejects the document;
            the original exception is attached as the cause.
    """

    # Decoding happens outside the ``try``: it raises its own errors.
    text = _decode_input(inp)

    target = _ParserTarget(case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces)
    parser = xml.etree.ElementTree.XMLParser(target=target)

    try:
        parser.feed(text)
        root = parser.close()
    except xml.etree.ElementTree.ParseError as exc:
        raise error.ParseError(exc.args[0]) from exc

    return root
67class _ParserTarget:
68 def __init__(self, case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces):
69 self.stack = []
70 self.root = None
71 self.buf = []
72 self.case_insensitive = case_insensitive
73 self.compact_whitespace = compact_whitespace
74 self.remove_namespaces = remove_namespaces
75 self.normalize_namespaces = normalize_namespaces
77 def convert_name(self, s: str) -> str:
78 """"Converts a given XML-namespace or URI to a proper name.
80 Args:
81 s: XML-namespace or URI.
83 Returns:
84 ``{URI}properName``
85 if ``normalize_namespaces`` flag is True, ``{non-versionalized-URL}properName`` is returned,
86 if ``remove_namespaces`` flag is True ``properName`` is returned.
87 """
88 xmlns, uri, pname = namespace.split_name(s)
89 pname = pname.lower() if self.case_insensitive else pname
90 if not xmlns and not uri:
91 return pname
92 if self.remove_namespaces:
93 return pname
94 if self.normalize_namespaces:
95 ns = namespace.find_by_uri(uri)
96 if ns:
97 uri = ns.uri
98 return '{' + uri + '}' + pname
100 def make(self, tag: str, attrib: dict) -> gws.XmlElement:
101 """Creates an Element.
103 Args:
104 tag: The tag.
105 attrib: ``{key:value}``
106 """
107 attrib2 = {}
109 if attrib:
110 for name, val in attrib.items():
111 attrib2[self.convert_name(name)] = val
113 el = element.XmlElementImpl(self.convert_name(tag), attrib2)
114 el.caseInsensitive = self.case_insensitive
116 return el
118 def flush(self):
119 """Loads the buffer into the stack and clears the stack."""
120 if not self.buf:
121 return
123 text = ''.join(self.buf)
124 self.buf = []
126 if self.compact_whitespace:
127 text = ' '.join(text.strip().split())
129 if text:
130 top = self.stack[-1]
131 if len(top) > 0:
132 top[-1].tail = text
133 else:
134 top.text = text
136 ##
138 def start(self, tag: str, attrib: dict):
139 """Flushes the buffer and appends an element to the stack.
141 Args:
142 tag: Tag of the XML-element.
143 attrib: Attribute of the XML-element.
144 """
145 self.flush()
146 el = self.make(tag, attrib)
147 if self.stack:
148 self.stack[-1].append(el)
149 else:
150 self.root = el
151 self.stack.append(el)
153 def end(self, tag):
154 """Flushes the buffer and pops the stack."""
155 self.flush()
156 self.stack.pop()
158 def data(self, data):
159 """Adds data to the buffer.
161 Args:
162 data: data to add."""
163 self.buf.append(data)
165 def close(self):
166 """Returns root."""
167 return self.root
170def _decode_input(inp) -> str:
171 # the problem is, we can receive a document
172 # that is declared ISO-8859-1, but actually is UTF and vice versa.
173 # therefore, don't let expat do the decoding, always give it a `str`
174 # and remove the xml decl with the (possibly incorrect) encoding
176 if isinstance(inp, bytes):
177 inp = inp.strip()
179 encodings = []
181 if inp.startswith(b'<?xml'):
182 try:
183 end = inp.index(b'?>')
184 except ValueError:
185 raise error.ParseError('invalid XML declaration')
187 head = inp[:end].decode('ascii').lower()
188 m = re.search(r'encoding\s*=\s*(\S+)', head)
189 if m:
190 encodings.append(m.group(1).strip('\'\"'))
191 inp = inp[end + 2:]
193 # try the declared encoding, if any, then utf8, then latin
195 if 'utf8' not in encodings:
196 encodings.append('utf8')
197 if 'iso-8859-1' not in encodings:
198 encodings.append('iso-8859-1')
200 for enc in encodings:
201 try:
202 return inp.decode(encoding=enc, errors='strict')
203 except (LookupError, UnicodeDecodeError):
204 pass
206 raise error.ParseError(f'invalid document encoding, tried {",".join(encodings)}')
208 if isinstance(inp, str):
209 inp = inp.strip()
211 if inp.startswith('<?xml'):
212 try:
213 end = inp.index('?>')
214 except ValueError:
215 raise error.ParseError('invalid XML declaration')
216 return inp[end + 2:]
218 return inp
220 raise error.ParseError(f'invalid input')