Coverage for gws-app/gws/lib/vendor/dog/builder.py: 0%

1from typing import List, Optional

3import re

4import os

5import json

6import fnmatch

7import shutil

8import mimetypes

10from . import util, template, markdown

11from .options import Options

class ParseNode(util.Data):
    """Common base class of the nodes stored in ``Section.nodes``."""

class MarkdownNode(ParseNode):
    """Parse node wrapping a single parsed markdown element."""

    # the wrapped markdown element
    el: markdown.Element

class SectionNode(ParseNode):
    """Parse node that stands for a subsection at this position."""

    # id of the referenced section
    sid: str

class EmbedNode(ParseNode):
    """Parse node that embeds other sections by a (possibly wildcard) section id.

    ``Builder.make_tree`` replaces each embed node with one ``SectionNode``
    per section matched by ``sid``.
    """

    items: List[str]
    # section id to embed, may contain '*'
    sid: str

class TocNode(ParseNode):
    """Parse node rendered as a local table of contents."""

    # section ids (possibly with wildcards) as written in the source
    items: List[str]
    # concrete section ids, resolved from `items` by Builder.expand_toc_nodes
    sids: List[str]
    # nesting depth passed to HTMLGenerator.render_toc_entry
    depth: int

class RawHtmlNode(ParseNode):
    """Parse node whose content is copied to the output page verbatim."""

    # html fragment emitted as is
    html: str

class Section(util.Data):
    """A documentation section: a heading together with the nodes below it."""

    # absolute section id; '/' denotes the root section
    sid: str
    # 0 for the root section, otherwise the number of slashes in `sid`
    level: int
    # marker used by Builder.make_tree: '' (not visited), 'walk' (in progress), 'ok' (done)
    status: str
    # ids of the direct subsections, in the order make_tree encountered them
    subSids: List[str]
    # id of the section this one was bound to by make_tree
    parentSid: str

    # path of the file this section was parsed from
    sourcePath: str

    # plain text of the heading
    headText: str
    # rendered children of the heading element
    headHtml: str
    # `headHtml` wrapped in a link to `htmlUrl`
    headHtmlLink: str
    # node holding the heading element
    headNode: MarkdownNode
    # number used for the rendered `h<N>` tag
    headLevel: int

    # content of the section, starting with `headNode`
    nodes: List[ParseNode]

    # output file that receives this section
    htmlPath: str
    # `htmlBaseUrl`, plus '#' + `htmlId` when `htmlId` is not empty
    htmlUrl: str
    # url of the page that contains this section
    htmlBaseUrl: str
    # id of the `<section>` element inside the page
    htmlId: str

class Builder:
    """Collect documentation sources, parse them into a section tree and
    produce HTML pages, a PDF, or a JSON dump from that tree.
    """

    options: Options
    markdownParser: markdown.Markdown
    htmlGenerator: 'HTMLGenerator'
    docPaths: set[str]
    assetPaths: set[str]
    sectionMap: dict[str, Section]
    sectionNotFound: set[str]
    assetMap: dict[str, str]

    # names of the generated scripts, written below `options.staticDir`
    GLOBAL_TOC_SCRIPT = '_global_toc.js'
    SEARCH_INDEX_SCRIPT = '_search_index.js'

    def __init__(self, opts: Options | dict):
        """Create a builder.

        Args:
            opts: An ``Options`` object or a dict; every entry is copied
                onto a fresh ``Options`` instance.
        """
        self.options = Options()
        if isinstance(opts, Options):
            opts = vars(opts)
        for k, v in opts.items():
            setattr(self.options, k, v)

        util.log.set_level('DEBUG' if self.options.debug else 'INFO')

        # text prepended to every source file before it is rendered as a template
        self.includeTemplate = ''
        if self.options.includeTemplate:
            self.includeTemplate = util.read_file(self.options.includeTemplate)

        self.cache = {}

    def collect_and_parse(self):
        """Reset the builder state, then collect and parse all sources."""
        self.markdownParser = markdown.parser()

        self.docPaths = set()
        self.assetPaths = set()
        self.sectionMap = {}
        self.sectionNotFound = set()
        self.assetMap = {}

        self.collect_sources()
        self.parse_all()

    def build_html(self, write=False):
        """Parse the sources and generate HTML.

        Args:
            write: When true, pages, assets and scripts are written to
                ``options.outputDir``; otherwise content is only kept in memory.
        """
        self.collect_and_parse()
        if not self.sectionMap:
            util.log.error('no sections, skip build_html')
            return
        self.generate_html(write=write)
        if write:
            util.log.info(f'HTML created in {self.options.outputDir!r}')

    def build_pdf(self):
        """Render everything into one temporary HTML page and convert it to
        ``<outputDir>/index.pdf``.
        """
        pdf_temp_dir = '/tmp/dog_pdf'
        shutil.rmtree(pdf_temp_dir, ignore_errors=True)

        pdf_opts = Options()
        vars(pdf_opts).update(vars(self.options))

        # split level 0 at the root puts all sections into a single index.html
        pdf_opts.fileSplitLevel = {'/': 0}
        pdf_opts.outputDir = pdf_temp_dir
        pdf_opts.webRoot = '.'

        if self.options.pdfPageTemplate:
            pdf_opts.pageTemplate = self.options.pdfPageTemplate

        old_opts = self.options
        self.options = pdf_opts
        try:
            self.collect_and_parse()
            if not self.sectionMap:
                util.log.error('no sections, skip build_pdf')
                return
            self.generate_html(write=True)
        finally:
            # always restore the caller's options, also on the early return
            # above and when parsing or rendering raises
            self.options = old_opts

        out_path = self.options.outputDir + '/index.pdf'
        try:
            self.generate_pdf(pdf_temp_dir + '/index.html', out_path)
        finally:
            shutil.rmtree(pdf_temp_dir, ignore_errors=True)

        util.log.info(f'PDF created in {out_path!r}')

    def dump(self):
        """Parse the sources and return the section map as a JSON string.

        Objects json cannot serialize are written as their ``vars()`` plus
        a ``'$'`` key holding the class name.
        """

        def _default(x):
            d = dict(vars(x))
            d['$'] = x.__class__.__name__
            return d

        self.collect_and_parse()
        return json.dumps(
            self.sectionMap, indent=4, sort_keys=True, ensure_ascii=False, default=_default)

    ##

    def collect_sources(self):
        """Scan every directory listed in ``options.docRoots``."""
        for dirname in self.options.docRoots:
            self.collect_sources_from_dir(dirname)

    def collect_sources_from_dir(self, dirname):
        """Recursively add matching files below ``dirname`` to ``docPaths``
        and ``assetPaths``.

        Entries whose name starts with a dot are skipped, as are entries whose
        path matches ``options.excludeRegex``. A file matching both pattern
        lists counts as a document.
        """
        de: os.DirEntry
        ex = self.options.excludeRegex

        # the context manager closes the directory handle even if a callee raises
        with os.scandir(dirname) as entries:
            for de in entries:
                if de.name.startswith('.'):
                    continue
                if ex and re.search(ex, de.path):
                    util.log.debug(f'exclude: {de.path!r}')
                elif de.is_dir():
                    self.collect_sources_from_dir(de.path)
                elif de.is_file() and any(fnmatch.fnmatch(de.name, p) for p in self.options.docPatterns):
                    self.docPaths.add(de.path)
                elif de.is_file() and any(fnmatch.fnmatch(de.name, p) for p in self.options.assetPatterns):
                    self.assetPaths.add(de.path)

    ##

    def get_section(self, sid: str) -> Optional[Section]:
        """Return the section with the given id, or None.

        A missing id is logged once and remembered in ``sectionNotFound`` so
        that repeated lookups stay silent.
        """
        if sid in self.sectionNotFound:
            return None
        if sid not in self.sectionMap:
            util.log.error(f'section not found: {sid!r}')
            self.sectionNotFound.add(sid)
            return None
        return self.sectionMap[sid]

    def section_from_url(self, url) -> Optional[Section]:
        """Return the first section whose ``htmlBaseUrl`` equals ``url``, or None."""
        for sec in self.sectionMap.values():
            if sec.htmlBaseUrl == url:
                return sec
        return None

    def section_from_element(self, el: markdown.Element) -> Optional[Section]:
        """Return the section whose heading element equals ``el``, or None."""
        for sec in self.sectionMap.values():
            if sec.headNode.el == el:
                return sec
        return None

    def sections_from_wildcard_sid(self, sid, parent_sec) -> List[Section]:
        """Resolve a section id that may contain ``*`` wildcards.

        Args:
            sid: Section id, absolute or relative to ``parent_sec``.
            parent_sec: Section the id is resolved against.

        Returns:
            Matching sections. For a wildcard id they are sorted by heading
            text; ``*`` matches one or more characters other than ``/``.
        """
        abs_sid = self.make_sid(sid, parent_sec.sid, '', '')

        if not abs_sid:
            util.log.error(f'invalid section id {sid!r} in {parent_sec.sourcePath!r}')
            return []

        if '*' not in abs_sid:
            sub = self.get_section(abs_sid)
            return [sub] if sub else []

        # escape the literal parts so that characters like '.' or '+' in a
        # section id are not interpreted as regex syntax
        pattern = re.compile('[^/]+'.join(re.escape(p) for p in abs_sid.split('*')) + '$')
        subs = [sec for sec in self.sectionMap.values() if pattern.match(sec.sid)]
        return sorted(subs, key=lambda sec: sec.headText)

    ##

    def generate_html(self, write):
        """Render all sections; when ``write`` is true also write pages,
        assets, the global toc script and the search index script.
        """
        self.assetMap = {}

        for path in self.options.extraAssets:
            self.add_asset(path)

        self.htmlGenerator = HTMLGenerator(self)
        self.htmlGenerator.render_section_heads()
        self.htmlGenerator.render_sections()
        self.htmlGenerator.flush()

        if write:
            self.htmlGenerator.write()
            self.write_assets()

            util.write_file(
                str(os.path.join(self.options.outputDir, self.options.staticDir, self.GLOBAL_TOC_SCRIPT)),
                self.generate_global_toc())

            util.write_file(
                str(os.path.join(self.options.outputDir, self.options.staticDir, self.SEARCH_INDEX_SCRIPT)),
                self.generate_search_index())

    def generate_pdf(self, source: str, target: str):
        """Convert the HTML file ``source`` to the PDF file ``target`` with wkhtmltopdf.

        Entries of ``options.pdfOptions`` become ``--key value`` arguments.
        A value of ``True`` yields a bare ``--key`` switch; ``False`` and
        ``None`` omit the option altogether.
        """
        cmd = [
            'wkhtmltopdf',
            '--outline',
            '--enable-local-file-access',
            '--print-media-type',
            '--disable-javascript',
        ]

        for k, v in (self.options.pdfOptions or {}).items():
            if v is False or v is None:
                # a disabled switch must not end up as '--key False' on the command line
                continue
            cmd.append(f'--{k}')
            if v is not True:
                cmd.append(str(v))

        cmd.append(source)
        cmd.append(target)

        util.run(cmd, pipe=True)

    ##

    def generate_global_toc(self):
        """Return a script assigning the table of contents to ``GLOBAL_TOC``.

        Each section id maps to its heading text ('h'), url ('u'), parent id
        ('p', '' when no section lists it as a subsection) and subsection ids ('s').
        """
        js = {
            sec.sid: {
                'h': sec.headText,
                'u': sec.htmlUrl,
                'p': '',
                's': sec.subSids
            }
            for sec in self.sectionMap.values()
        }
        for sec in self.sectionMap.values():
            for sub in sec.subSids:
                node = js.get(sub)
                if node:
                    node['p'] = sec.sid

        return 'GLOBAL_TOC = ' + json.dumps(js, ensure_ascii=False, indent=4) + '\n'

    def generate_search_index(self):
        """Return a script assigning the full-text index to ``SEARCH_INDEX``.

        ``words`` is the sorted vocabulary joined with dots, with a leading dot.
        Each section entry carries its heading html ('h', followed by the
        parent heading in parentheses), its url ('u') and its words as
        dot-separated base36 numbers ('w'), numbered from 1 in vocabulary order.
        """
        words_map = {}

        for sec in self.sectionMap.values():
            words_map[sec.sid] = []
            for node in sec.nodes:
                if isinstance(node, MarkdownNode):
                    self.extract_text(node.el, words_map[sec.sid])

        # normalize: drop apostrophes, collapse non-word characters, lowercase
        for sid, words in words_map.items():
            ws = ' '.join(words)
            ws = ws.replace("'", '')
            ws = re.sub(r'\W+', ' ', ws).lower().strip()
            words_map[sid] = ws.split()

        all_words = sorted(set(w for ws in words_map.values() for w in ws))
        word_index = {w: n for n, w in enumerate(all_words, 1)}

        sections = []
        for sid, words in words_map.items():
            sec = self.sectionMap[sid]
            head = sec.headHtml
            if sec.parentSid:
                parent = self.sectionMap[sec.parentSid]
                head += ' (' + parent.headHtml + ')'
            sections.append({
                'h': head,
                'u': sec.htmlUrl,
                'w': '.' + '.'.join(util.base36(word_index[w]) for w in words) + '.'
            })

        js = {
            'words': '.' + '.'.join(all_words),
            'sections': sorted(sections, key=lambda s: s['h']),
        }

        return 'SEARCH_INDEX = ' + json.dumps(js, ensure_ascii=False, indent=4) + '\n'

    def extract_text(self, el: markdown.Element, out: list):
        """Append the text found in ``el`` and its descendants to ``out``.

        An element with text contributes that text and is not descended into.
        """
        if el.text:
            out.append(el.text)
            return
        if el.children:
            for c in el.children:
                self.extract_text(c, out)
            # NOTE(review): nesting reconstructed from a listing without
            # indentation; the separator is assumed to follow each group of
            # children - confirm against the original file
            out.append('.')

    ##

    def content_for_url(self, url):
        """Return ``(mime_type, content)`` for a requested url, or None.

        Handles rendered pages (urls ending in '.html'), the two generated
        scripts and collected assets below ``options.staticDir``.
        """
        if url.endswith('.html'):
            sec = self.section_from_url(url)
            if sec:
                return 'text/html', self.htmlGenerator.content[sec.htmlPath]
            return None

        # staticDir is a literal directory name, not a pattern
        m = re.search(re.escape(self.options.staticDir) + '/(.+)$', url)
        if not m:
            return None

        fn = m.group(1)
        if fn.endswith(self.GLOBAL_TOC_SCRIPT):
            return 'application/javascript', self.generate_global_toc()
        if fn.endswith(self.SEARCH_INDEX_SCRIPT):
            # the search index is generated once per builder instance
            attr = '_CACHED_SEARCH_INDEX'
            if not hasattr(self, attr):
                setattr(self, attr, self.generate_search_index())
            return 'application/javascript', getattr(self, attr)

        for path, fname in self.assetMap.items():
            if fname == fn:
                # guess_type always returns a 2-tuple; its first item is None
                # for unknown extensions
                mime_type, _ = mimetypes.guess_type(path)
                return mime_type or 'text/plain', util.read_file_b(path)

        return None

    def add_asset(self, path):
        """Register the asset at ``path`` and return its public url."""
        if path not in self.assetMap:
            self.assetMap[path] = self.unique_asset_filename(path)
        return self.options.webRoot + '/' + self.options.staticDir + '/' + self.assetMap[path]

    def unique_asset_filename(self, path):
        """Return a file name for ``path`` that no registered asset uses yet.

        The base name is used when it is free; otherwise ``-1``, ``-2``, ...
        is inserted before the extension until the name is unique.
        """
        taken = set(self.assetMap.values())
        fname = os.path.basename(path)
        if fname not in taken:
            return fname

        # splitext copes with names that have no dot or several dots
        base, ext = os.path.splitext(fname)
        n = 1
        while True:
            candidate = f'{base}-{n}{ext}'
            if candidate not in taken:
                return candidate
            n += 1

    def write_assets(self):
        """Copy every registered asset into the static output directory."""
        for src, fname in self.assetMap.items():
            dst = str(os.path.join(self.options.outputDir, self.options.staticDir, fname))
            util.log.debug(f'copy {src!r} => {dst!r}')
            util.write_file_b(dst, util.read_file_b(src))

    ##

    def parse_all(self):
        """Parse all collected documents and build the section tree.

        Leaves ``sectionMap`` empty when there is no root section ('/').
        Sections not reachable from the root are reported and dropped.
        """
        self.sectionMap = {}

        # docPaths is a set: iterate in sorted order so that the winner of a
        # redefined section does not depend on hash ordering
        for path in sorted(self.docPaths):
            for sec in self.parse_file(path):
                prev = self.sectionMap.get(sec.sid)
                if prev:
                    util.log.warning(f'section redefined {sec.sid!r} from {prev.sourcePath!r} in {sec.sourcePath!r}')
                self.sectionMap[sec.sid] = sec

        root = self.sectionMap.get('/')
        if not root:
            util.log.error('no root section found')
            self.sectionMap = {}
            return

        new_map = {}
        self.make_tree(root, None, new_map)

        for sec in self.sectionMap.values():
            if sec.sid not in new_map:
                util.log.warning(f'unbound section {sec.sid!r} in {sec.sourcePath!r}')

        self.sectionMap = new_map

        for sec in self.sectionMap.values():
            self.expand_toc_nodes(sec)

        self.add_url_and_path(root, 0)

    def parse_file(self, path):
        """Return the sections defined in the file at ``path``."""
        return FileParser(self, path).sections()

    def make_tree(self, sec: Section, parent_sec: Section | None, new_map):
        """Bind ``sec`` to ``parent_sec`` and recursively resolve its subsections.

        Visited sections are stored in ``new_map``. Embed nodes are replaced
        by one ``SectionNode`` per matched section. ``sec.status`` guards
        against walking a section twice and detects circular references.
        """
        if parent_sec:
            if sec.parentSid:
                util.log.warning(f'rebinding section {sec.sid!r} from {sec.parentSid!r} to {parent_sec.sid!r}')
            sec.parentSid = parent_sec.sid

        if sec.status == 'ok':
            return

        if sec.status == 'walk':
            util.log.error(f'circular dependency in {sec.sid!r}')
            return

        sec.status = 'walk'

        sub_sids: list[str] = []
        new_nodes: list[ParseNode] = []
        new_map[sec.sid] = sec

        for node in sec.nodes:

            if isinstance(node, SectionNode):
                sub = self.get_section(node.sid)
                if sub:
                    self.make_tree(sub, sec, new_map)
                    sub_sids.append(sub.sid)
                    # NOTE(review): nesting reconstructed from a listing
                    # without indentation; nodes of missing sections are
                    # assumed to be dropped - confirm against the original file
                    new_nodes.append(node)
                continue

            if isinstance(node, EmbedNode):
                secs = self.sections_from_wildcard_sid(node.sid, sec)
                for sub in secs:
                    self.make_tree(sub, sec, new_map)
                    sub_sids.append(sub.sid)
                    new_nodes.append(SectionNode(sid=sub.sid))
                continue

            new_nodes.append(node)

        sec.nodes = new_nodes
        sec.subSids = sub_sids
        sec.status = 'ok'

    def expand_toc_nodes(self, sec: Section):
        """Resolve the ``items`` of every ``TocNode`` in ``sec`` into ``sids``."""
        for node in sec.nodes:
            if isinstance(node, TocNode):
                sids = []
                for sid in node.items:
                    secs = self.sections_from_wildcard_sid(sid, sec)
                    sids.extend(s.sid for s in secs)
                node.sids = sids

    def add_url_and_path(self, sec: Section, split_level):
        """Assign output path, urls, html id and heading level to ``sec`` and
        recursively to its subsections.

        Args:
            sec: Section to process.
            split_level: Number of leading sid components that form the page
                directory; an entry for ``sec.sid`` in
                ``options.fileSplitLevel`` overrides it for this subtree.
        """
        if sec.sid in self.options.fileSplitLevel:
            split_level = self.options.fileSplitLevel[sec.sid]

        parts = sec.sid.split('/')[1:]

        if sec.level == 0 or split_level == 0:
            path = 'index.html'
        else:
            dirname = '/'.join(parts[:split_level])
            path = dirname + '/index.html'

        # sid components beyond the split level become the in-page anchor
        sec.htmlId = '-'.join(parts[split_level:])
        sec.htmlPath = self.options.outputDir + '/' + path
        sec.htmlBaseUrl = self.options.webRoot + '/' + path

        util.log.debug(f'path {sec.sid} -> {sec.htmlPath} ({split_level})')

        sec.htmlUrl = sec.htmlBaseUrl
        if sec.htmlId:
            sec.htmlUrl += '#' + sec.htmlId

        sec.headLevel = max(1, sec.level - split_level + 1)

        for sub in sec.subSids:
            sub = self.sectionMap[sub]
            self.add_url_and_path(sub, split_level)

    def make_sid(self, explicit_sid, parent_sid, prev_sid=None, text=None):
        """Compute an absolute section id, or '' when none can be derived.

        Args:
            explicit_sid: Id given in the source; when empty the id is derived
                from ``text``. An id ending in '/' gets the text id appended.
            parent_sid: Id of the enclosing section, used for relative ids.
            prev_sid: Id of the preceding section of the same level; its
                directory part is used when there is no parent id.
            text: Heading text.
        """
        explicit_sid = explicit_sid or ''
        text_sid = util.to_uid(text) if text else ''

        if explicit_sid == '/':
            return '/'

        sid = explicit_sid or text_sid
        if sid.endswith('/'):
            sid += text_sid
        if not sid or sid.endswith('/'):
            return ''

        if sid.startswith('/'):
            return util.normpath(sid)

        if parent_sid:
            return util.normpath(parent_sid + '/' + sid)

        if prev_sid:
            ps, _, _ = prev_sid.rpartition('/')
            return util.normpath(ps + '/' + sid)

        return ''

    ##

    def cached(self, key, fn):
        """Return the cached value for ``key``, computing it with ``fn()`` on first use."""
        if key not in self.cache:
            self.cache[key] = fn()
        return self.cache[key]

529

530

class FileParser:
    """Turns a single documentation source file into a list of sections."""

    def __init__(self, b: Builder, path):
        self.b = b
        self.path = path

    def sections(self) -> List[Section]:
        """Parse the file and return its sections in document order.

        Headings open new sections; every other element is appended to the
        section that is currently innermost.
        """
        util.log.debug(f'parse {self.path!r}')

        result = []

        # sentinel below every real heading level, so the stack is never empty
        sentinel = Section(
            sid='',
            nodes=[],
            level=-1,
            headNode=MarkdownNode(el=markdown.Element(level=-1)))
        open_secs = [sentinel]

        el: markdown.Element
        for el in self.parse():

            if el.type == 'heading':
                # close all deeper sections, then a sibling of the same level
                sibling = None
                while open_secs[-1].headNode.el.level > el.level:
                    open_secs.pop()
                if open_secs[-1].headNode.el.level == el.level:
                    sibling = open_secs.pop()

                new_sec = self.parse_heading(el, open_secs[-1], sibling)
                if new_sec:
                    open_secs.append(new_sec)
                    result.append(new_sec)

            elif el.type == 'block_code' and el.text.startswith(template.GENERATED_NODE):
                # a code block carrying the marker holds a json-encoded node
                # NOTE(review): the node class is looked up by name in the
                # module globals; this trusts the template output
                payload = el.text[len(template.GENERATED_NODE):]
                args = json.loads(payload)
                node_cls = globals()[args.pop('class')]
                open_secs[-1].nodes.append(node_cls(**args))

            else:
                open_secs[-1].nodes.append(MarkdownNode(el=el))

        return result

    def parse(self) -> list[markdown.Element]:
        """Render the file (prefixed with the include template) as a template
        and parse the result as markdown; an empty result yields no elements.
        """
        source = self.b.includeTemplate + util.read_file(self.path)
        context = {
            'options': self.b.options,
            'builder': self.b,
        }
        rendered = template.render(self.b, source, self.path, context)
        if rendered:
            return self.b.markdownParser(rendered)
        return []

    def parse_heading(self, el: markdown.Element, parent_sec, prev_sec):
        """Create a section from a heading element.

        Registers a ``SectionNode`` (or, for a heading without text, an
        ``EmbedNode``) with ``parent_sec``. Returns the new section, or None
        when the heading is an embed or has no valid id.
        """
        explicit_sid = self.extract_explicit_sid(el)
        text = markdown.text_from_element(el)

        prev_sid = prev_sec.sid if prev_sec else None
        sid = self.b.make_sid(explicit_sid, parent_sec.sid, prev_sid, text)

        # a first-level heading without any id becomes the root section
        if not sid and (el.level == 1 and text and not explicit_sid):
            util.log.debug(f'creating implicit root section {text!r} in {self.path!r}')
            sid = '/'

        if not sid:
            util.log.error(f'invalid section id for {text!r}:{explicit_sid!r} in {self.path!r}')
            return

        if not text:
            parent_sec.nodes.append(EmbedNode(sid=sid))
            return

        parent_sec.nodes.append(SectionNode(sid=sid))
        el.sid = sid
        head = MarkdownNode(el=el)
        depth = 0 if sid == '/' else sid.count('/')

        return Section(
            sid=sid,
            level=depth,
            status='',
            sourcePath=self.path,
            headText=text,
            headNode=head,
            nodes=[head],
        )

    def extract_explicit_sid(self, el: markdown.Element) -> str:
        """Strip a trailing ``:sid`` from the heading and return the sid.

        Only the last child is inspected and it must be a text element.
        Returns '' when the heading carries no explicit id.
        """
        kids = el.children
        if not kids:
            return ''

        last = kids[-1]
        if last.type != 'text':
            return ''

        found = re.match(r'^(.*?):(\S+)$', last.text)
        if found is None:
            return ''

        last.text = found.group(1)
        markdown.strip_text_content(el)

        return found.group(2)

636

637

class HTMLGenerator:
    """Renders the builder's section tree into HTML pages.

    Rendered fragments are gathered per output file in ``buffers``; ``flush``
    wraps them in the page template and stores the pages in ``content``.
    """

    def __init__(self, b: Builder):
        self.b = b
        # output path -> data object with `sids` and `html` lists
        self.buffers = {}
        # output path -> complete page html
        self.content = {}

    def render_section_heads(self):
        """Fill ``headHtml`` and ``headHtmlLink`` of every section."""
        for sec in self.b.sectionMap.values():
            mr = MarkdownRenderer(self.b, sec)
            sec.headHtml = mr.render_children(sec.headNode.el)
            sec.headHtmlLink = f'<a href="{sec.htmlUrl}">{sec.headHtml}</a>'

    def render_sections(self):
        """Render every section without a parent, including its subsections."""
        for sec in self.b.sectionMap.values():
            if not sec.parentSid:
                self.render_section(sec.sid)

    def render_section(self, sid):
        """Render the section ``sid`` and, recursively, its subsections into the buffers."""
        sec = self.b.get_section(sid)
        if not sec:
            return

        util.log.debug(f'render {sid!r}')

        mr = MarkdownRenderer(self.b, sec)

        self.add(sec, f'<section id="{sec.htmlId}" data-sid="{sec.sid}">\n')

        for node in sec.nodes:
            if isinstance(node, MarkdownNode):
                self.add(sec, mr.render_element(node.el))
            elif isinstance(node, SectionNode):
                self.render_section(node.sid)
            elif isinstance(node, TocNode):
                entries = ''.join(self.render_toc_entry(s, node.depth) for s in node.sids)
                self.add(sec, f'<div class="localtoc"><ul>{entries}</ul></div>')
            elif isinstance(node, RawHtmlNode):
                self.add(sec, node.html)

        self.add(sec, '</section>\n')

    def render_toc_entry(self, sid, depth: int):
        """Return the ``<li>`` for section ``sid`` with up to ``depth`` levels
        of nested entries.

        Returns an empty string when the section does not exist, so that the
        result can always be passed to ``str.join``.
        """
        sec = self.b.get_section(sid)
        if not sec:
            return ''

        nested = ''
        if depth > 1:
            # skip entries of sections that could not be resolved
            entries = [e for e in (self.render_toc_entry(s, depth - 1) for s in sec.subSids) if e]
            if entries:
                nested = '<ul>' + ''.join(entries) + '</ul>'

        return f'<li data-sid="{sid}">{sec.headHtmlLink}{nested}</li>'

    def render_main_toc(self):
        """Return the toc entries of all subsections of the root section,
        or None when there is no root section.
        """
        root = self.b.get_section('/')
        if not root:
            return None
        return '\n'.join(
            self.render_toc_entry(sid, 999)
            for sid in root.subSids
        )

    def add(self, sec: Section, html):
        """Append ``html`` to the buffer of the page that contains ``sec``."""
        if sec.htmlPath not in self.buffers:
            self.buffers[sec.htmlPath] = util.Data(sids=[], html=[])
        buf = self.buffers[sec.htmlPath]
        buf.sids.append(sec.sid)
        buf.html.append(html)

    def flush(self):
        """Render every buffer through the page template into ``content``."""
        tpl = template.compile(self.b, self.b.options.pageTemplate)

        self.content = {}

        home_url = ''
        sec = self.b.get_section('/')
        if sec:
            home_url = sec.htmlUrl

        for path, buf in self.buffers.items():
            self.content[path] = template.call(self.b, tpl, {
                'path': path,
                'title': self.b.options.title,
                'subTitle': self.b.options.subTitle,
                'main': ''.join(buf.html),
                # breadcrumbs lead to the first section written to the page
                'breadcrumbs': self.get_breadcrumbs(buf.sids[0]),
                'home': home_url,
                'builder': self.b,
                'options': self.b.options,
            })

    def write(self):
        """Write all rendered pages to their output paths."""
        for path, html in self.content.items():
            util.log.debug(f'write {path!r}')
            util.write_file(path, html)

    def get_breadcrumbs(self, sid):
        """Return ``(url, heading html)`` pairs from the topmost ancestor down
        to the section ``sid``; an empty list when the section is unknown.
        """
        sec = self.b.get_section(sid)
        if not sec:
            return []

        # collected from the section upwards, reversed at the end
        trail = []
        while sec:
            trail.append((sec.htmlUrl, sec.headHtml))
            if not sec.parentSid:
                break
            sec = self.b.get_section(sec.parentSid)

        trail.reverse()
        return trail

754

755

class MarkdownRenderer(markdown.Renderer):
    """Markdown renderer that resolves links, images and headings against
    the builder's sections and assets.
    """

    def __init__(self, b: Builder, sec: Section):
        self.b = b
        # section whose content is rendered; relative links resolve against it
        self.sec = sec

    def link_render(self, el: markdown.Element):
        """Render a link.

        ``http:``/``https:`` links are kept as they are. A link starting with
        ``//`` loses its first slash and is not treated as a section id. Any
        other link is tried as a section id; when a section is found the link
        points to it and missing title or content default to its heading.
        """
        c = self.render_children(el)
        link = el.link
        if link.startswith(('http:', 'https:')):
            return self.render_a(link, el.title, c, el)
        if link.startswith('//'):
            return self.render_a(link[1:], el.title, c, el)

        sid = self.b.make_sid(link, self.sec.sid)
        target = self.b.get_section(sid)
        if not target:
            return self.render_a(link, el.title, c, el)
        return self.render_a(
            target.htmlUrl,
            el.title or target.headText,
            c or target.headHtml,
            el
        )

    def image_render(self, el: markdown.Element):
        """Render an image.

        A local ``src`` is matched against the end of the collected asset
        paths; the matching asset is registered with the builder and ``src``
        is replaced by its public url. An unknown asset is logged and
        rendered with an empty ``src``.
        """
        if not el.src:
            return ''
        if el.src.startswith(('http:', 'https:')):
            return super().image_render(el)

        # assetPaths is a set: sort the matches so that the asset chosen for
        # an ambiguous src does not depend on hash ordering
        paths = sorted(path for path in self.b.assetPaths if path.endswith(el.src))
        if not paths:
            util.log.error(f'asset not found: {el.src!r} ')
            el.src = ''
            return super().image_render(el)

        el.src = self.b.add_asset(paths[0])
        return super().image_render(el)

    def heading_render(self, el: markdown.Element):
        """Render the heading of a section as ``<hN data-url="...">``.

        N is the section's ``headLevel``. In debug mode the source path is
        added as the ``title`` attribute. Returns None when the element is
        not the heading of a known section.
        """
        sec = self.b.section_from_element(el)
        if not sec:
            return
        c = self.render_children(el)
        tag = 'h' + str(sec.headLevel)
        a = {'data-url': sec.htmlUrl}
        if self.b.options.debug:
            a['title'] = markdown.escape(sec.sourcePath)
        return f'<{tag}{markdown.attributes(a)}>{c}</{tag}>\n'