Coverage for gws-app/gws/lib/vendor/dog/builder.py: 0%
547 statements
« prev | ^ index | » next — coverage.py v7.8.0, created at 2025-04-17 01:37 +0200
1from typing import List, Optional
3import re
4import os
5import json
6import fnmatch
7import shutil
8import mimetypes
10from . import util, template, markdown
11from .options import Options
class ParseNode(util.Data):
    """Common base class for the nodes kept in ``Section.nodes``."""
class MarkdownNode(ParseNode):
    """Node that wraps a single parsed markdown element."""

    # the markdown element this node stands for
    el: markdown.Element
class SectionNode(ParseNode):
    """Node that refers to another section by its section id."""

    # id of the referenced section
    sid: str
class EmbedNode(ParseNode):
    """Placeholder that ``Builder.make_tree`` replaces with section references.

    ``sid`` may contain ``*`` wildcards; it is resolved with
    ``Builder.sections_from_wildcard_sid``.
    """

    # NOTE(review): not read anywhere in this file - confirm its purpose
    items: List[str]
    # (possibly wildcard) id of the sections to embed
    sid: str
class TocNode(ParseNode):
    """Local table of contents embedded in a section."""

    # section ids (wildcards allowed) as written in the source
    items: List[str]
    # concrete section ids, filled in by ``Builder.expand_toc_nodes``
    sids: List[str]
    # nesting depth passed to ``HTMLGenerator.render_toc_entry``
    depth: int
class RawHtmlNode(ParseNode):
    """Node whose HTML is copied to the output as is."""

    # markup emitted verbatim by ``HTMLGenerator.render_section``
    html: str
class Section(util.Data):
    """A documentation section: its heading, content nodes and output location."""

    # --- identity and position in the section tree ---
    sid: str
    # 0 for the root section, otherwise the number of slashes in ``sid``
    level: int
    # tree-walk marker used by ``Builder.make_tree``: '', 'walk' or 'ok'
    status: str
    subSids: List[str]
    parentSid: str

    # --- origin ---
    sourcePath: str

    # --- heading ---
    headText: str
    headHtml: str
    headHtmlLink: str
    headNode: MarkdownNode
    headLevel: int

    # --- content ---
    nodes: List[ParseNode]

    # --- output location, assigned by ``Builder.add_url_and_path`` ---
    htmlPath: str
    htmlUrl: str
    htmlBaseUrl: str
    htmlId: str
class Builder:
    """Collect documentation sources, parse them into a section tree and render it.

    The public entry points are ``build_html``, ``build_pdf`` and ``dump``.
    Each of them re-collects and re-parses the sources found under
    ``options.docRoots`` before doing its work.
    """

    options: Options
    markdownParser: markdown.Markdown
    htmlGenerator: 'HTMLGenerator'
    docPaths: set[str]
    assetPaths: set[str]
    sectionMap: dict[str, Section]
    sectionNotFound: set[str]
    assetMap: dict[str, str]

    # names of the generated scripts placed in the static directory
    GLOBAL_TOC_SCRIPT = '_global_toc.js'
    SEARCH_INDEX_SCRIPT = '_search_index.js'

    def __init__(self, opts: Options | dict):
        """Create a builder.

        Args:
            opts: An ``Options`` object or a plain dict. Its entries are copied
                onto a fresh ``Options`` instance, so the argument itself is
                not retained.
        """
        self.options = Options()
        if isinstance(opts, Options):
            opts = vars(opts)
        for k, v in opts.items():
            setattr(self.options, k, v)

        util.log.set_level('DEBUG' if self.options.debug else 'INFO')

        self.includeTemplate = ''
        if self.options.includeTemplate:
            self.includeTemplate = util.read_file(self.options.includeTemplate)

        self.cache = {}

    def collect_and_parse(self):
        """Reset all collected state, then scan the doc roots and parse every document."""
        self.markdownParser = markdown.parser()

        self.docPaths = set()
        self.assetPaths = set()
        self.sectionMap = {}
        self.sectionNotFound = set()
        self.assetMap = {}

        self.collect_sources()
        self.parse_all()

    def build_html(self, write=False):
        """Parse the sources and render HTML.

        Args:
            write: When true, the pages, assets and generated scripts are
                written below ``options.outputDir``; otherwise the rendered
                pages are only kept in memory on ``self.htmlGenerator``.
        """
        self.collect_and_parse()
        if not self.sectionMap:
            util.log.error('no sections, skip build_html')
            return
        self.generate_html(write=write)
        if write:
            util.log.info(f'HTML created in {self.options.outputDir!r}')

    def build_pdf(self):
        """Render everything into a single HTML page and convert it to ``index.pdf``.

        The HTML is produced in a temporary directory with options adjusted
        for single-file output; the PDF is written to ``options.outputDir``.
        """
        pdf_temp_dir = '/tmp/dog_pdf'
        shutil.rmtree(pdf_temp_dir, ignore_errors=True)

        pdf_opts = Options()
        vars(pdf_opts).update(vars(self.options))

        # a split level of 0 at the root puts every section into one file
        pdf_opts.fileSplitLevel = {'/': 0}
        pdf_opts.outputDir = pdf_temp_dir
        pdf_opts.webRoot = '.'

        if self.options.pdfPageTemplate:
            pdf_opts.pageTemplate = self.options.pdfPageTemplate

        old_opts = self.options
        self.options = pdf_opts
        try:
            self.collect_and_parse()
            if not self.sectionMap:
                util.log.error('no sections, skip build_pdf')
                return
            self.generate_html(write=True)
        finally:
            # always restore the caller's options, including on the early
            # return above and when parsing or rendering raises
            self.options = old_opts

        out_path = self.options.outputDir + '/index.pdf'
        self.generate_pdf(pdf_temp_dir + '/index.html', out_path)
        shutil.rmtree(pdf_temp_dir, ignore_errors=True)

        util.log.info(f'PDF created in {out_path!r}')

    def dump(self):
        """Parse the sources and return the section map as a JSON string.

        Objects that are not JSON-serializable are dumped as a dict of their
        attributes plus a ``'$'`` key holding the class name.
        """

        def _default(x):
            d = dict(vars(x))
            d['$'] = x.__class__.__name__
            return d

        self.collect_and_parse()
        return json.dumps(
            self.sectionMap, indent=4, sort_keys=True, ensure_ascii=False, default=_default)

    ##

    def collect_sources(self):
        """Scan every directory listed in ``options.docRoots``."""
        for dirname in self.options.docRoots:
            self.collect_sources_from_dir(dirname)

    def collect_sources_from_dir(self, dirname):
        """Recursively collect document and asset paths below ``dirname``.

        Entries whose name starts with a dot and entries whose path matches
        ``options.excludeRegex`` are skipped. Files are classified by
        ``options.docPatterns`` first, then by ``options.assetPatterns``.
        """
        de: os.DirEntry
        ex = self.options.excludeRegex

        # the context manager closes the directory handle deterministically
        with os.scandir(dirname) as entries:
            for de in entries:
                if de.name.startswith('.'):
                    pass
                elif ex and re.search(ex, de.path):
                    util.log.debug(f'exclude: {de.path!r}')
                elif de.is_dir():
                    self.collect_sources_from_dir(de.path)
                elif de.is_file() and any(fnmatch.fnmatch(de.name, p) for p in self.options.docPatterns):
                    self.docPaths.add(de.path)
                elif de.is_file() and any(fnmatch.fnmatch(de.name, p) for p in self.options.assetPatterns):
                    self.assetPaths.add(de.path)

    ##

    def get_section(self, sid: str) -> Optional[Section]:
        """Return the section with the given id, or None.

        A missing id is logged as an error only the first time it is
        requested; later lookups of the same id return None silently.
        """
        if sid in self.sectionNotFound:
            return
        if sid not in self.sectionMap:
            util.log.error(f'section not found: {sid!r}')
            self.sectionNotFound.add(sid)
            return
        return self.sectionMap.get(sid)

    def section_from_url(self, url) -> Optional[Section]:
        """Return the first section whose ``htmlBaseUrl`` equals ``url``, or None."""
        for sec in self.sectionMap.values():
            if sec.htmlBaseUrl == url:
                return sec

    def section_from_element(self, el: markdown.Element) -> Optional[Section]:
        """Return the section whose heading element equals ``el``, or None."""
        for sec in self.sectionMap.values():
            if sec.headNode.el == el:
                return sec

    def sections_from_wildcard_sid(self, sid, parent_sec) -> List[Section]:
        """Resolve a section id, which may contain ``*``, relative to ``parent_sec``.

        Each ``*`` matches exactly one path component. Without a wildcard the
        result holds at most one section; with a wildcard the matches are
        returned sorted by their heading text.
        """
        abs_sid = self.make_sid(sid, parent_sec.sid, '', '')

        if not abs_sid:
            util.log.error(f'invalid section id {sid!r} in {parent_sec.sourcePath!r}')
            return []

        if '*' not in abs_sid:
            sub = self.get_section(abs_sid)
            if sub:
                return [sub]
            return []

        # escape the literal parts so that characters such as '.' or '+' in
        # an explicit sid are not interpreted as regex syntax
        rx = re.compile('[^/]+'.join(re.escape(part) for part in abs_sid.split('*')))
        subs = [
            sec
            for sec in self.sectionMap.values()
            if rx.fullmatch(sec.sid)
        ]
        return sorted(subs, key=lambda sec: sec.headText)

    ##

    def generate_html(self, write):
        """Render all sections; when ``write`` is true also write pages, assets and scripts."""
        self.assetMap = {}

        for path in self.options.extraAssets:
            self.add_asset(path)

        self.htmlGenerator = HTMLGenerator(self)
        self.htmlGenerator.render_section_heads()
        self.htmlGenerator.render_sections()
        self.htmlGenerator.flush()

        if write:
            self.htmlGenerator.write()
            self.write_assets()

            util.write_file(
                str(os.path.join(self.options.outputDir, self.options.staticDir, self.GLOBAL_TOC_SCRIPT)),
                self.generate_global_toc())

            util.write_file(
                str(os.path.join(self.options.outputDir, self.options.staticDir, self.SEARCH_INDEX_SCRIPT)),
                self.generate_search_index())

    def generate_pdf(self, source: str, target: str):
        """Convert the HTML file ``source`` to the PDF file ``target`` with ``wkhtmltopdf``.

        Entries of ``options.pdfOptions`` are passed as ``--key value``; a
        value of ``True`` yields a bare ``--key`` flag.
        """
        cmd = [
            'wkhtmltopdf',
            '--outline',
            '--enable-local-file-access',
            '--print-media-type',
            '--disable-javascript',
        ]

        if self.options.pdfOptions:
            for k, v in self.options.pdfOptions.items():
                cmd.append(f'--{k}')
                if v is not True:
                    cmd.append(str(v))

        cmd.append(source)
        cmd.append(target)

        util.run(cmd, pipe=True)

    ##

    def generate_global_toc(self):
        """Return a JavaScript snippet assigning the table of contents to ``GLOBAL_TOC``.

        Every section id maps to its heading text (``h``), URL (``u``),
        parent section id (``p``) and list of sub-section ids (``s``).
        """
        js = {
            sec.sid: {
                'h': sec.headText,
                'u': sec.htmlUrl,
                'p': '',
                's': sec.subSids
            }
            for sec in self.sectionMap.values()
        }
        for sec in self.sectionMap.values():
            for sub in sec.subSids:
                node = js.get(sub)
                if node:
                    node['p'] = sec.sid

        return 'GLOBAL_TOC = ' + json.dumps(js, ensure_ascii=False, indent=4) + '\n'

    def generate_search_index(self):
        """Return a JavaScript snippet assigning the search index to ``SEARCH_INDEX``.

        ``words`` is the dot-joined sorted vocabulary. For each section ``w``
        lists its words as base-36 encoded 1-based positions in that
        vocabulary, separated and surrounded by dots.
        """
        words_map = {}

        for sec in self.sectionMap.values():
            words_map[sec.sid] = []
            for node in sec.nodes:
                if isinstance(node, MarkdownNode):
                    self.extract_text(node.el, words_map[sec.sid])

        for sid, words in words_map.items():
            ws = ' '.join(words)
            ws = ws.replace("'", '')
            ws = re.sub(r'\W+', ' ', ws).lower().strip()
            words_map[sid] = ws.split()

        all_words = sorted(set(w for ws in words_map.values() for w in ws))
        word_index = {w: n for n, w in enumerate(all_words, 1)}

        sections = []
        for sid, words in words_map.items():
            sec = self.sectionMap[sid]
            head = sec.headHtml
            if sec.parentSid:
                parent = self.sectionMap[sec.parentSid]
                head += ' (' + parent.headHtml + ')'
            sections.append({
                'h': head,
                'u': sec.htmlUrl,
                'w': '.' + '.'.join(util.base36(word_index[w]) for w in words) + '.'
            })

        js = {
            'words': '.' + '.'.join(all_words),
            'sections': sorted(sections, key=lambda s: s['h']),
        }

        return 'SEARCH_INDEX = ' + json.dumps(js, ensure_ascii=False, indent=4) + '\n'

    def extract_text(self, el: markdown.Element, out: list):
        """Append the text found in ``el`` and its descendants to ``out``.

        An element with text contributes that text only. After the children
        of a container a ``'.'`` is appended as a separator.
        """
        if el.text:
            out.append(el.text)
            return
        if el.children:
            for c in el.children:
                self.extract_text(c, out)
            out.append('.')

    ##

    def content_for_url(self, url):
        """Return ``(mime_type, content)`` for ``url``, or None if nothing matches.

        ``.html`` URLs are answered from the rendered pages. URLs inside the
        static directory are answered with the generated scripts or with a
        registered asset.
        """
        if url.endswith('.html'):
            sec = self.section_from_url(url)
            if sec:
                return 'text/html', self.htmlGenerator.content[sec.htmlPath]
            return

        # staticDir is a path fragment, not a pattern: match it literally
        m = re.search(re.escape(self.options.staticDir) + '/(.+)$', url)
        if not m:
            return

        fn = m.group(1)
        if fn.endswith(self.GLOBAL_TOC_SCRIPT):
            return 'application/javascript', self.generate_global_toc()
        if fn.endswith(self.SEARCH_INDEX_SCRIPT):
            # the index is generated once per builder instance
            attr = '_CACHED_SEARCH_INDEX'
            if not hasattr(self, attr):
                setattr(self, attr, self.generate_search_index())
            return 'application/javascript', getattr(self, attr)

        for path, fname in self.assetMap.items():
            if fname == fn:
                # guess_type always returns a (type, encoding) tuple, so the
                # fallback has to test the type itself, not the tuple
                mime_type = mimetypes.guess_type(path)[0] or 'text/plain'
                return mime_type, util.read_file_b(path)

    def add_asset(self, path):
        """Register the asset file ``path`` and return the URL it is served under."""
        if path not in self.assetMap:
            self.assetMap[path] = self.unique_asset_filename(path)
        return self.options.webRoot + '/' + self.options.staticDir + '/' + self.assetMap[path]

    def unique_asset_filename(self, path):
        """Return a file name for ``path`` that no registered asset uses yet.

        The base name is used as is when it is free; otherwise ``-1``, ``-2``,
        ... is inserted before the last extension until a free name is found.
        """
        fnames = set(self.assetMap.values())
        fname = os.path.basename(path)
        if fname not in fnames:
            return fname

        # splitext copes with names that have no dot or several dots
        base, ext = os.path.splitext(fname)
        n = 1
        while True:
            fname2 = f'{base}-{n}{ext}'
            if fname2 not in fnames:
                return fname2
            n += 1

    def write_assets(self):
        """Copy every registered asset into the static directory of the output."""
        for src, fname in self.assetMap.items():
            dst = str(os.path.join(self.options.outputDir, self.options.staticDir, fname))
            util.log.debug(f'copy {src!r} => {dst!r}')
            util.write_file_b(dst, util.read_file_b(src))

    ##

    def parse_all(self):
        """Parse all collected documents and build the section tree.

        Sections that cannot be reached from the root section ``'/'`` are
        reported and dropped. If there is no root section, the section map is
        left empty.
        """
        self.sectionMap = {}

        # docPaths is a set: sort it so that the winner of a duplicate
        # section id does not depend on hash ordering
        for path in sorted(self.docPaths):
            for sec in self.parse_file(path):
                prev = self.sectionMap.get(sec.sid)
                if prev:
                    util.log.warning(f'section redefined {sec.sid!r} from {prev.sourcePath!r} in {sec.sourcePath!r}')
                self.sectionMap[sec.sid] = sec

        root = self.sectionMap.get('/')
        if not root:
            util.log.error('no root section found')
            self.sectionMap = {}
            return

        new_map = {}
        self.make_tree(root, None, new_map)

        for sec in self.sectionMap.values():
            if sec.sid not in new_map:
                util.log.warning(f'unbound section {sec.sid!r} in {sec.sourcePath!r}')

        self.sectionMap = new_map

        for sec in self.sectionMap.values():
            self.expand_toc_nodes(sec)

        self.add_url_and_path(root, 0)

    def parse_file(self, path):
        """Return the sections defined in the document at ``path``."""
        return FileParser(self, path).sections()

    def make_tree(self, sec: Section, parent_sec: Section | None, new_map):
        """Bind ``sec`` below ``parent_sec`` and recursively resolve its children.

        Section and embed nodes are resolved to existing sections, which are
        recorded in ``sec.subSids`` and in ``new_map``.

        Returns:
            True if ``sec`` can be linked as a child of ``parent_sec``, False
            if linking it would close a cycle. Cyclic links are reported and
            left out, so later tree walks are guaranteed to terminate.
        """
        # check for a cycle before touching parentSid, otherwise an ancestor
        # would be rebound below its own descendant
        if sec.status == 'walk':
            util.log.error(f'circular dependency in {sec.sid!r}')
            return False

        if parent_sec:
            if sec.parentSid:
                util.log.warning(f'rebinding section {sec.sid!r} from {sec.parentSid!r} to {parent_sec.sid!r}')
            sec.parentSid = parent_sec.sid

        if sec.status == 'ok':
            return True

        sec.status = 'walk'

        sub_sids: list[str] = []
        new_nodes: list[ParseNode] = []
        new_map[sec.sid] = sec

        for node in sec.nodes:

            if isinstance(node, SectionNode):
                sub = self.get_section(node.sid)
                if sub and self.make_tree(sub, sec, new_map):
                    sub_sids.append(sub.sid)
                    new_nodes.append(node)
                continue

            if isinstance(node, EmbedNode):
                secs = self.sections_from_wildcard_sid(node.sid, sec)
                for sub in secs:
                    if self.make_tree(sub, sec, new_map):
                        sub_sids.append(sub.sid)
                        new_nodes.append(SectionNode(sid=sub.sid))
                continue

            new_nodes.append(node)

        sec.nodes = new_nodes
        sec.subSids = sub_sids
        sec.status = 'ok'
        return True

    def expand_toc_nodes(self, sec: Section):
        """Resolve the items of every ``TocNode`` in ``sec`` to concrete section ids."""
        for node in sec.nodes:
            if isinstance(node, TocNode):
                sids = []
                for sid in node.items:
                    secs = self.sections_from_wildcard_sid(sid, sec)
                    sids.extend(s.sid for s in secs)
                node.sids = sids

    def add_url_and_path(self, sec: Section, split_level):
        """Assign output path, URLs, HTML id and heading level to ``sec`` and its subtree.

        ``split_level`` is the number of leading sid components that form the
        output directory. It is inherited by sub-sections unless
        ``options.fileSplitLevel`` has an entry for the section id.
        """
        if sec.sid in self.options.fileSplitLevel:
            split_level = self.options.fileSplitLevel[sec.sid]

        parts = sec.sid.split('/')[1:]

        if sec.level == 0 or split_level == 0:
            path = 'index.html'
        else:
            dirname = '/'.join(parts[:split_level])
            path = dirname + '/index.html'

        # components beyond the split level become the in-page anchor
        sec.htmlId = '-'.join(parts[split_level:])
        sec.htmlPath = self.options.outputDir + '/' + path
        sec.htmlBaseUrl = self.options.webRoot + '/' + path

        util.log.debug(f'path {sec.sid} -> {sec.htmlPath} ({split_level})')

        sec.htmlUrl = sec.htmlBaseUrl
        if sec.htmlId:
            sec.htmlUrl += '#' + sec.htmlId

        sec.headLevel = max(1, sec.level - split_level + 1)

        for sub in sec.subSids:
            sub = self.sectionMap[sub]
            self.add_url_and_path(sub, split_level)

    def make_sid(self, explicit_sid, parent_sid, prev_sid=None, text=None):
        """Build an absolute section id, or return ``''`` if none can be built.

        The id is taken from ``explicit_sid`` or, when that is empty, derived
        from ``text``. An explicit id ending in ``/`` has the text-derived id
        appended. An id starting with ``/`` is absolute; otherwise it is
        resolved against ``parent_sid`` or, failing that, against the
        directory part of ``prev_sid``.
        """

        explicit_sid = explicit_sid or ''
        text_sid = util.to_uid(text) if text else ''

        if explicit_sid == '/':
            return '/'

        sid = explicit_sid or text_sid
        if sid.endswith('/'):
            sid += text_sid
        if not sid or sid.endswith('/'):
            return ''

        if sid.startswith('/'):
            return util.normpath(sid)

        if parent_sid:
            return util.normpath(parent_sid + '/' + sid)

        if prev_sid:
            ps, _, _ = prev_sid.rpartition('/')
            return util.normpath(ps + '/' + sid)

        return ''

    ##

    def cached(self, key, fn):
        """Return the value stored under ``key``, computing it with ``fn()`` on first use."""
        if key not in self.cache:
            self.cache[key] = fn()
        return self.cache[key]
class FileParser:
    """Turn one source document into a flat list of ``Section`` objects."""

    def __init__(self, b: Builder, path):
        self.b = b
        self.path = path

    def sections(self) -> List[Section]:
        """Parse the document and return its sections in document order.

        A stack of currently open sections tracks heading nesting. Elements
        that are not headings are attached to the innermost open section.
        """
        util.log.debug(f'parse {self.path!r}')

        found = []

        # sentinel with level -1: it collects nodes that precede the first
        # heading and is never popped by a real heading
        sentinel = Section(
            sid='',
            nodes=[],
            level=-1,
            headNode=MarkdownNode(el=markdown.Element(level=-1)))
        open_sections = [sentinel]

        el: markdown.Element
        for el in self.parse():
            if el.type == 'heading':
                self._handle_heading(el, open_sections, found)
            elif el.type == 'block_code' and el.text.startswith(template.GENERATED_NODE):
                # code block carrying a JSON description of a node; the
                # 'class' entry names the node class, the rest are its fields
                args = json.loads(el.text[len(template.GENERATED_NODE):])
                node_cls = globals()[args.pop('class')]
                open_sections[-1].nodes.append(node_cls(**args))
            else:
                open_sections[-1].nodes.append(MarkdownNode(el=el))

        return found

    def _handle_heading(self, el, open_sections, found):
        """Close the sections ended by heading ``el`` and open the one it starts."""
        # sections with a deeper heading end here
        while open_sections[-1].headNode.el.level > el.level:
            open_sections.pop()

        # a section on the same level ends too and becomes the 'previous' one
        prev_sec = None
        if open_sections[-1].headNode.el.level == el.level:
            prev_sec = open_sections.pop()

        sec = self.parse_heading(el, open_sections[-1], prev_sec)
        if sec:
            open_sections.append(sec)
            found.append(sec)

    def parse(self) -> list[markdown.Element]:
        """Render the document through the template engine and parse the markdown.

        The builder's include template is prepended to the file content.
        Returns an empty list when rendering yields no text.
        """
        source = self.b.includeTemplate + util.read_file(self.path)
        rendered = template.render(self.b, source, self.path, {
            'options': self.b.options,
            'builder': self.b,
        })
        return self.b.markdownParser(rendered) if rendered else []

    def parse_heading(self, el: markdown.Element, parent_sec, prev_sec):
        """Create a section for heading ``el`` and register it with ``parent_sec``.

        Returns the new ``Section``. Returns None when no section id can be
        built, or when the heading has no text, in which case an ``EmbedNode``
        is added to the parent instead.
        """
        # the explicit sid must be extracted first: doing so strips it from
        # the heading text
        explicit_sid = self.extract_explicit_sid(el)
        text = markdown.text_from_element(el)

        prev_sid = prev_sec.sid if prev_sec else None
        sid = self.b.make_sid(explicit_sid, parent_sec.sid, prev_sid, text)

        if not sid and el.level == 1 and text and not explicit_sid:
            util.log.debug(f'creating implicit root section {text!r} in {self.path!r}')
            sid = '/'

        if not sid:
            util.log.error(f'invalid section id for {text!r}:{explicit_sid!r} in {self.path!r}')
            return None

        if not text:
            parent_sec.nodes.append(EmbedNode(sid=sid))
            return None

        parent_sec.nodes.append(SectionNode(sid=sid))
        el.sid = sid
        head_node = MarkdownNode(el=el)

        if sid == '/':
            level = 0
        else:
            level = sid.count('/')

        return Section(
            sid=sid,
            level=level,
            status='',
            sourcePath=self.path,
            headText=text,
            headNode=head_node,
            nodes=[head_node],
        )

    def extract_explicit_sid(self, el: markdown.Element) -> str:
        """Return the ``:sid`` suffix of a heading and remove it from the heading.

        The suffix is looked for only in the last child, and only if that
        child is a text element. Returns ``''`` when there is none.
        """
        children = el.children
        if not children:
            return ''

        last = children[-1]
        if last.type != 'text':
            return ''

        m = re.match(r'^(.*?):(\S+)$', last.text)
        if m is None:
            return ''

        heading_text, sid = m.group(1), m.group(2)
        last.text = heading_text
        markdown.strip_text_content(el)

        return sid
class HTMLGenerator:
    """Render the builder's sections into complete HTML pages.

    Section markup is accumulated per output file in ``buffers``; ``flush``
    wraps each buffer in the page template and stores the result in
    ``content``, keyed by output path.
    """

    def __init__(self, b: Builder):
        self.b = b
        self.buffers = {}
        self.content = {}

    def render_section_heads(self):
        """Pre-render the heading HTML and heading link of every section."""
        for sec in self.b.sectionMap.values():
            mr = MarkdownRenderer(self.b, sec)
            sec.headHtml = mr.render_children(sec.headNode.el)
            sec.headHtmlLink = f'<a href="{sec.htmlUrl}">{sec.headHtml}</a>'

    def render_sections(self):
        """Render every section without a parent; sub-sections are rendered recursively."""
        for sec in self.b.sectionMap.values():
            if not sec.parentSid:
                self.render_section(sec.sid)

    def render_section(self, sid):
        """Render the section ``sid`` and its sub-sections into the page buffers.

        Unknown section ids are ignored.
        """
        sec = self.b.get_section(sid)
        if not sec:
            return

        util.log.debug(f'render {sid!r}')

        mr = MarkdownRenderer(self.b, sec)

        self.add(sec, f'<section id="{sec.htmlId}" data-sid="{sec.sid}">\n')

        for node in sec.nodes:
            if isinstance(node, MarkdownNode):
                html = mr.render_element(node.el)
                self.add(sec, html)
                continue
            if isinstance(node, SectionNode):
                self.render_section(node.sid)
                continue
            if isinstance(node, TocNode):
                entries = ''.join(self.render_toc_entry(sid, node.depth) for sid in node.sids)
                html = f'<div class="localtoc"><ul>{entries}</ul></div>'
                self.add(sec, html)
                continue
            if isinstance(node, RawHtmlNode):
                self.add(sec, node.html)
                continue

        self.add(sec, '</section>\n')

    def render_toc_entry(self, sid, depth: int):
        """Return the ``<li>`` markup for section ``sid``.

        Sub-sections are nested while ``depth`` is greater than 1. An unknown
        section id yields an empty string, so the result can always be passed
        to ``str.join``.
        """
        sec = self.b.get_section(sid)
        if not sec:
            return ''

        nested = ''
        if depth > 1:
            sub = [self.render_toc_entry(sub_sid, depth - 1) for sub_sid in sec.subSids]
            if sub:
                nested = '<ul>' + ''.join(sub) + '</ul>'

        return f'<li data-sid="{sid}">{sec.headHtmlLink}{nested}</li>'

    def render_main_toc(self):
        """Return the TOC entries of all sections below the root, or None without a root."""
        root = self.b.get_section('/')
        if not root:
            return
        return '\n'.join(
            self.render_toc_entry(sid, 999)
            for sid in root.subSids
        )

    def add(self, sec: Section, html):
        """Append ``html`` to the buffer of the output file that ``sec`` belongs to."""
        if sec.htmlPath not in self.buffers:
            self.buffers[sec.htmlPath] = util.Data(sids=[], html=[])
        self.buffers[sec.htmlPath].sids.append(sec.sid)
        self.buffers[sec.htmlPath].html.append(html)

    def flush(self):
        """Wrap every page buffer in the page template and store it in ``content``."""
        tpl = template.compile(self.b, self.b.options.pageTemplate)

        self.content = {}

        home_url = ''
        sec = self.b.get_section('/')
        if sec:
            home_url = sec.htmlUrl

        for path, buf in self.buffers.items():
            self.content[path] = template.call(self.b, tpl, {
                'path': path,
                'title': self.b.options.title,
                'subTitle': self.b.options.subTitle,
                'main': ''.join(buf.html),
                # the first section added to a page determines its breadcrumbs
                'breadcrumbs': self.get_breadcrumbs(buf.sids[0]),
                'home': home_url,
                'builder': self.b,
                'options': self.b.options,
            })

    def write(self):
        """Write every rendered page to its output path."""
        for path, html in self.content.items():
            util.log.debug(f'write {path!r}')
            util.write_file(path, html)

    def get_breadcrumbs(self, sid):
        """Return ``(url, heading html)`` pairs from the outermost ancestor down to ``sid``.

        Returns an empty list for an unknown section id.
        """
        sec = self.b.get_section(sid)
        if not sec:
            return []

        # walk up the parent chain, then flip the result to root-first order
        crumbs = []
        while sec:
            crumbs.append((sec.htmlUrl, sec.headHtml))
            if not sec.parentSid:
                break
            sec = self.b.get_section(sec.parentSid)

        crumbs.reverse()
        return crumbs
class MarkdownRenderer(markdown.Renderer):
    """Markdown renderer that resolves links, images and headings for one section."""

    def __init__(self, b: Builder, sec: Section):
        self.b = b
        self.sec = sec

    def link_render(self, el: markdown.Element):
        """Render a link, resolving section ids to the URL of the target section.

        ``http:`` and ``https:`` links are kept as they are. A link starting
        with ``//`` has its first slash removed and is not resolved. Anything
        else is treated as a section id relative to the current section; if
        no such section exists, the link is rendered unchanged.
        """
        inner = self.render_children(el)
        href = el.link

        if href.startswith(('http:', 'https:')):
            return self.render_a(href, el.title, inner, el)
        if href.startswith('//'):
            return self.render_a(href[1:], el.title, inner, el)

        target = self.b.get_section(self.b.make_sid(href, self.sec.sid))
        if target:
            # title and link text fall back to the heading of the target
            return self.render_a(
                target.htmlUrl,
                el.title or target.headText,
                inner or target.headHtml,
                el
            )
        return self.render_a(href, el.title, inner, el)

    def image_render(self, el: markdown.Element):
        """Render an image, registering local images as assets.

        Remote images are rendered untouched. For a local image, ``src`` is
        replaced by the asset URL of the first collected asset path ending
        with it, or by an empty string if there is no such asset.
        """
        if not el.src:
            return ''

        if not el.src.startswith(('http:', 'https:')):
            candidates = [path for path in self.b.assetPaths if path.endswith(el.src)]
            if candidates:
                el.src = self.b.add_asset(candidates[0])
            else:
                util.log.error(f'asset not found: {el.src!r} ')
                el.src = ''

        return super().image_render(el)

    def heading_render(self, el: markdown.Element):
        """Render a section heading using the heading level computed for its section.

        Returns None if ``el`` is not the heading of a known section.
        """
        sec = self.b.section_from_element(el)
        if not sec:
            return

        inner = self.render_children(el)
        tag = 'h' + str(sec.headLevel)

        attrs = {'data-url': sec.htmlUrl}
        if self.b.options.debug:
            # in debug mode the source file is exposed as a tooltip
            attrs['title'] = markdown.escape(sec.sourcePath)

        return f'<{tag}{markdown.attributes(attrs)}>{inner}</{tag}>\n'