Coverage for gws-app/gws/lib/xmlx/parser.py: 67%

1"""XML parser."""

3import re

4import xml.etree.ElementTree

6import gws

8from . import error, element, namespace

def from_path(
    path: str,
    case_insensitive: bool = False,
    compact_whitespace: bool = False,
    normalize_namespaces: bool = False,
    remove_namespaces: bool = False,
) -> gws.XmlElement:
    """Read an XML file and parse it into an ``XmlElement``.

    The file is read in binary mode, so the encoding is resolved by the
    parser and not by ``open``.

    Args:
        path: Path to the .xml file.
        case_insensitive: If true, tag and attribute names are lowercased.
        compact_whitespace: If true, text is trimmed and inner whitespace runs
            are collapsed to a single space.
        normalize_namespaces: If true, a namespace URI is replaced with the URI
            of the matching known namespace, when one is found.
        remove_namespaces: If true, names are stored without their namespace.

    Returns:
        The root element of the parsed document.
    """

    with open(path, 'rb') as source:
        raw = source.read()

    options = (case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces)
    return _parse(raw, *options)

def from_string(
    inp: str | bytes,
    case_insensitive: bool = False,
    compact_whitespace: bool = False,
    remove_namespaces: bool = False,
    normalize_namespaces: bool = False,
) -> gws.XmlElement:
    """Parse an in-memory XML document into an ``XmlElement``.

    Args:
        inp: The XML document, as a string or as bytes.
        case_insensitive: If true, tag and attribute names are lowercased.
        compact_whitespace: If true, text is trimmed and inner whitespace runs
            are collapsed to a single space.
        remove_namespaces: If true, names are stored without their namespace.
        normalize_namespaces: If true, a namespace URI is replaced with the URI
            of the matching known namespace, when one is found.

    Returns:
        The root element of the parsed document.
    """

    # The two namespace flags are declared in a different order here than in
    # ``_parse``, so they are forwarded by name.
    return _parse(
        inp,
        case_insensitive=case_insensitive,
        compact_whitespace=compact_whitespace,
        normalize_namespaces=normalize_namespaces,
        remove_namespaces=remove_namespaces,
    )

53##

def _parse(inp, case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces):
    """Decode the input, run it through the XML parser and return the root element.

    Args:
        inp: The XML document as ``str`` or ``bytes``.
        case_insensitive: Passed to the parser target.
        compact_whitespace: Passed to the parser target.
        normalize_namespaces: Passed to the parser target.
        remove_namespaces: Passed to the parser target.

    Returns:
        Whatever the parser target's ``close`` returns, i.e. the root element.

    Raises:
        error.ParseError: If the underlying parser reports a ``ParseError``.
    """
    text = _decode_input(inp)

    target = _ParserTarget(case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces)
    xml_parser = xml.etree.ElementTree.XMLParser(target=target)

    try:
        xml_parser.feed(text)
        root = xml_parser.close()
    except xml.etree.ElementTree.ParseError as exc:
        # re-raise as the package's own error type, keeping only the message
        raise error.ParseError(exc.args[0]) from exc

    return root

67class _ParserTarget:

68 def __init__(self, case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces):

69 self.stack = []

70 self.root = None

71 self.buf = []

72 self.case_insensitive = case_insensitive

73 self.compact_whitespace = compact_whitespace

74 self.remove_namespaces = remove_namespaces

75 self.normalize_namespaces = normalize_namespaces

77 def convert_name(self, s: str) -> str:

78 """"Converts a given XML-namespace or URI to a proper name.

80 Args:

81 s: XML-namespace or URI.

83 Returns:

84 ``{URI}properName``

85 if ``normalize_namespaces`` flag is True, ``{non-versionalized-URL}properName`` is returned,

86 if ``remove_namespaces`` flag is True ``properName`` is returned.

87 """

88 xmlns, uri, pname = namespace.split_name(s)

89 pname = pname.lower() if self.case_insensitive else pname

90 if not xmlns and not uri:

91 return pname

92 if self.remove_namespaces:

93 return pname

94 if self.normalize_namespaces:

95 ns = namespace.find_by_uri(uri)

96 if ns:

97 uri = ns.uri

98 return '{' + uri + '}' + pname

100 def make(self, tag: str, attrib: dict) -> gws.XmlElement:

101 """Creates an Element.

102

103 Args:

104 tag: The tag.

105 attrib: ``{key:value}``

106 """

107 attrib2 = {}

108

109 if attrib:

110 for name, val in attrib.items():

111 attrib2[self.convert_name(name)] = val

112

113 el = element.XmlElementImpl(self.convert_name(tag), attrib2)

114 el.caseInsensitive = self.case_insensitive

115

116 return el

117

118 def flush(self):

119 """Loads the buffer into the stack and clears the stack."""

120 if not self.buf:

121 return

122

123 text = ''.join(self.buf)

124 self.buf = []

125

126 if self.compact_whitespace:

127 text = ' '.join(text.strip().split())

128

129 if text:

130 top = self.stack[-1]

131 if len(top) > 0:

132 top[-1].tail = text

133 else:

134 top.text = text

135

136 ##

137

138 def start(self, tag: str, attrib: dict):

139 """Flushes the buffer and appends an element to the stack.

140

141 Args:

142 tag: Tag of the XML-element.

143 attrib: Attribute of the XML-element.

144 """

145 self.flush()

146 el = self.make(tag, attrib)

147 if self.stack:

148 self.stack[-1].append(el)

149 else:

150 self.root = el

151 self.stack.append(el)

152

153 def end(self, tag):

154 """Flushes the buffer and pops the stack."""

155 self.flush()

156 self.stack.pop()

157

158 def data(self, data):

159 """Adds data to the buffer.

160

161 Args:

162 data: data to add."""

163 self.buf.append(data)

164

165 def close(self):

166 """Returns root."""

167 return self.root

168

169

170def _decode_input(inp) -> str:

171 # the problem is, we can receive a document

172 # that is declared ISO-8859-1, but actually is UTF and vice versa.

173 # therefore, don't let expat do the decoding, always give it a `str`

174 # and remove the xml decl with the (possibly incorrect) encoding

175

176 if isinstance(inp, bytes):

177 inp = inp.strip()

178

179 encodings = []

180

181 if inp.startswith(b'<?xml'):

182 try:

183 end = inp.index(b'?>')

184 except ValueError:

185 raise error.ParseError('invalid XML declaration')

186

187 head = inp[:end].decode('ascii').lower()

188 m = re.search(r'encoding\s*=\s*(\S+)', head)

189 if m:

190 encodings.append(m.group(1).strip('\'\"'))

191 inp = inp[end + 2:]

192

193 # try the declared encoding, if any, then utf8, then latin

194

195 if 'utf8' not in encodings:

196 encodings.append('utf8')

197 if 'iso-8859-1' not in encodings:

198 encodings.append('iso-8859-1')

199

200 for enc in encodings:

201 try:

202 return inp.decode(encoding=enc, errors='strict')

203 except (LookupError, UnicodeDecodeError):

204 pass

205

206 raise error.ParseError(f'invalid document encoding, tried {",".join(encodings)}')

207

208 if isinstance(inp, str):

209 inp = inp.strip()

210

211 if inp.startswith('<?xml'):

212 try:

213 end = inp.index('?>')

214 except ValueError:

215 raise error.ParseError('invalid XML declaration')

216 return inp[end + 2:]

217

218 return inp

219

220 raise error.ParseError(f'invalid input')