#!/usr/bin/env python3 import os import re import requests import shutil from selenium import webdriver from selenium.webdriver.common.by import By from creole import creole2html from markdownify import markdownify def get_titles(browser) -> list[str]: browser.get("https://wiki.mumble.info/wiki/Special:AllPages") page_list = browser.find_elements(By.CLASS_NAME, "mw-allpages-chunk") if not page_list: raise Exception("ERROR: Could not get page list") pages = page_list[0].find_elements(By.TAG_NAME, "li") titles = [] for page in pages: if page.get_property("class") == "allpagesredirect": continue title_element = page.find_elements(By.TAG_NAME, "a") titles.append(title_element[0].get_property("title")) return titles class PageSource: def __init__(self, title: str, browser): self.title = title edit_link = f"https://wiki.mumble.info/index.php?title={title}&action=edit" browser.get(edit_link) source_textbox = browser.find_element(By.ID, "wpTextbox1") if not source_textbox: raise Exception(f"ERROR: could not find source textbox on page {edit_link}") contents = source_textbox.get_attribute("value") if not contents: raise Exception(f"ERROR: could not get page source for page {edit_link}") self.contents = contents self.image_urls = [] page_link = f"https://wiki.mumble.info/wiki/{title}" browser.get(page_link) images = browser.find_elements(By.CLASS_NAME, "image") for image in images: img_element = image.find_element(By.TAG_NAME, "img") img_url = img_element.get_attribute("src") title = img_element.get_attribute("alt") self.image_urls.append((title, img_url)) def as_html(self) -> str: html = creole2html(self.contents) return html def as_md(self) -> str: md = "" find_url = re.compile(r"(?