Coverage for gws-app/gws/lib/xmlx/parser.py: 67%

107 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-17 01:37 +0200

1"""XML parser.""" 

2 

3import re 

4import xml.etree.ElementTree 

5 

6import gws 

7 

8from . import error, element, namespace 

9 

10 

def from_path(
    path: str,
    case_insensitive: bool = False,
    compact_whitespace: bool = False,
    normalize_namespaces: bool = False,
    remove_namespaces: bool = False,
) -> gws.XmlElement:
    """Reads an XML file and builds an ``XmlElement`` tree from it.

    The file is opened in binary mode, so the encoding is resolved
    by the parser and not by the file object.

    Args:
        path: Path to the .xml file.
        case_insensitive: If true, tag and attribute names are converted to lowercase.
        compact_whitespace: If true, text is trimmed and each run of whitespace
            is collapsed to a single space; text that becomes empty is dropped.
        normalize_namespaces: If true, a namespace URI that ``namespace.find_by_uri``
            recognizes is replaced with the URI of the namespace found.
        remove_namespaces: If true, names are stored without their namespace.

    Returns:
        The root element of the parsed document.
    """

    with open(path, 'rb') as source:
        content = source.read()

    return _parse(
        content,
        case_insensitive=case_insensitive,
        compact_whitespace=compact_whitespace,
        normalize_namespaces=normalize_namespaces,
        remove_namespaces=remove_namespaces,
    )

31 

32 

def from_string(
    inp: str | bytes,
    case_insensitive: bool = False,
    compact_whitespace: bool = False,
    remove_namespaces: bool = False,
    normalize_namespaces: bool = False,
) -> gws.XmlElement:
    """Builds an ``XmlElement`` tree from an in-memory XML document.

    Args:
        inp: The XML document as a string or bytes.
        case_insensitive: If true, tag and attribute names are converted to lowercase.
        compact_whitespace: If true, text is trimmed and each run of whitespace
            is collapsed to a single space; text that becomes empty is dropped.
        remove_namespaces: If true, names are stored without their namespace.
        normalize_namespaces: If true, a namespace URI that ``namespace.find_by_uri``
            recognizes is replaced with the URI of the namespace found.

    Returns:
        The root element of the parsed document.
    """

    # Keyword arguments keep the two namespace flags from being swapped:
    # this signature lists them in a different order than ``_parse`` does.
    return _parse(
        inp,
        case_insensitive=case_insensitive,
        compact_whitespace=compact_whitespace,
        normalize_namespaces=normalize_namespaces,
        remove_namespaces=remove_namespaces,
    )

51 

52 

53## 

54 

55 

def _parse(inp, case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces):
    """Decodes the input and runs it through the standard library XML parser.

    Args:
        inp: The XML document as a string or bytes.
        case_insensitive: Passed on to ``_ParserTarget``.
        compact_whitespace: Passed on to ``_ParserTarget``.
        normalize_namespaces: Passed on to ``_ParserTarget``.
        remove_namespaces: Passed on to ``_ParserTarget``.

    Returns:
        Whatever the parser target's ``close`` returns, i.e. the root element.

    Raises:
        error.ParseError: If the standard library parser rejects the document.
    """
    text = _decode_input(inp)
    target = _ParserTarget(case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces)
    xml_parser = xml.etree.ElementTree.XMLParser(target=target)

    try:
        xml_parser.feed(text)
        return xml_parser.close()
    except xml.etree.ElementTree.ParseError as exc:
        # Re-raise as the package's own error type, keeping the original message.
        raise error.ParseError(exc.args[0]) from exc

65 

66 

67class _ParserTarget: 

68 def __init__(self, case_insensitive, compact_whitespace, normalize_namespaces, remove_namespaces): 

69 self.stack = [] 

70 self.root = None 

71 self.buf = [] 

72 self.case_insensitive = case_insensitive 

73 self.compact_whitespace = compact_whitespace 

74 self.remove_namespaces = remove_namespaces 

75 self.normalize_namespaces = normalize_namespaces 

76 

77 def convert_name(self, s: str) -> str: 

78 """"Converts a given XML-namespace or URI to a proper name. 

79 

80 Args: 

81 s: XML-namespace or URI. 

82 

83 Returns: 

84 ``{URI}properName`` 

85 if ``normalize_namespaces`` flag is True, ``{non-versionalized-URL}properName`` is returned, 

86 if ``remove_namespaces`` flag is True ``properName`` is returned. 

87 """ 

88 xmlns, uri, pname = namespace.split_name(s) 

89 pname = pname.lower() if self.case_insensitive else pname 

90 if not xmlns and not uri: 

91 return pname 

92 if self.remove_namespaces: 

93 return pname 

94 if self.normalize_namespaces: 

95 ns = namespace.find_by_uri(uri) 

96 if ns: 

97 uri = ns.uri 

98 return '{' + uri + '}' + pname 

99 

100 def make(self, tag: str, attrib: dict) -> gws.XmlElement: 

101 """Creates an Element. 

102 

103 Args: 

104 tag: The tag. 

105 attrib: ``{key:value}`` 

106 """ 

107 attrib2 = {} 

108 

109 if attrib: 

110 for name, val in attrib.items(): 

111 attrib2[self.convert_name(name)] = val 

112 

113 el = element.XmlElementImpl(self.convert_name(tag), attrib2) 

114 el.caseInsensitive = self.case_insensitive 

115 

116 return el 

117 

118 def flush(self): 

119 """Loads the buffer into the stack and clears the stack.""" 

120 if not self.buf: 

121 return 

122 

123 text = ''.join(self.buf) 

124 self.buf = [] 

125 

126 if self.compact_whitespace: 

127 text = ' '.join(text.strip().split()) 

128 

129 if text: 

130 top = self.stack[-1] 

131 if len(top) > 0: 

132 top[-1].tail = text 

133 else: 

134 top.text = text 

135 

136 ## 

137 

138 def start(self, tag: str, attrib: dict): 

139 """Flushes the buffer and appends an element to the stack. 

140 

141 Args: 

142 tag: Tag of the XML-element. 

143 attrib: Attribute of the XML-element. 

144 """ 

145 self.flush() 

146 el = self.make(tag, attrib) 

147 if self.stack: 

148 self.stack[-1].append(el) 

149 else: 

150 self.root = el 

151 self.stack.append(el) 

152 

153 def end(self, tag): 

154 """Flushes the buffer and pops the stack.""" 

155 self.flush() 

156 self.stack.pop() 

157 

158 def data(self, data): 

159 """Adds data to the buffer. 

160 

161 Args: 

162 data: data to add.""" 

163 self.buf.append(data) 

164 

165 def close(self): 

166 """Returns root.""" 

167 return self.root 

168 

169 

170def _decode_input(inp) -> str: 

171 # the problem is, we can receive a document 

172 # that is declared ISO-8859-1, but actually is UTF and vice versa. 

173 # therefore, don't let expat do the decoding, always give it a `str` 

174 # and remove the xml decl with the (possibly incorrect) encoding 

175 

176 if isinstance(inp, bytes): 

177 inp = inp.strip() 

178 

179 encodings = [] 

180 

181 if inp.startswith(b'<?xml'): 

182 try: 

183 end = inp.index(b'?>') 

184 except ValueError: 

185 raise error.ParseError('invalid XML declaration') 

186 

187 head = inp[:end].decode('ascii').lower() 

188 m = re.search(r'encoding\s*=\s*(\S+)', head) 

189 if m: 

190 encodings.append(m.group(1).strip('\'\"')) 

191 inp = inp[end + 2:] 

192 

193 # try the declared encoding, if any, then utf8, then latin 

194 

195 if 'utf8' not in encodings: 

196 encodings.append('utf8') 

197 if 'iso-8859-1' not in encodings: 

198 encodings.append('iso-8859-1') 

199 

200 for enc in encodings: 

201 try: 

202 return inp.decode(encoding=enc, errors='strict') 

203 except (LookupError, UnicodeDecodeError): 

204 pass 

205 

206 raise error.ParseError(f'invalid document encoding, tried {",".join(encodings)}') 

207 

208 if isinstance(inp, str): 

209 inp = inp.strip() 

210 

211 if inp.startswith('<?xml'): 

212 try: 

213 end = inp.index('?>') 

214 except ValueError: 

215 raise error.ParseError('invalid XML declaration') 

216 return inp[end + 2:] 

217 

218 return inp 

219 

220 raise error.ParseError(f'invalid input')