# this is a hack
# This commit is contained in:
# @@ -0,0 +1,152 @@
#!/usr/bin/env python3
"""Scrape the Mumble wiki: export each page as raw markup, HTML, and Markdown,
and download the images the pages reference."""

import os
import re
import shutil

import requests
from creole import creole2html
from markdownify import markdownify
from selenium import webdriver
from selenium.webdriver.common.by import By
def get_titles(browser) -> list[str]:
    """Collect the titles of all non-redirect pages from Special:AllPages.

    Args:
        browser: A selenium WebDriver instance used to load and query pages.

    Returns:
        A list of page-title strings.

    Raises:
        Exception: If the page-list container cannot be located.
    """
    browser.get("https://wiki.mumble.info/wiki/Special:AllPages")

    page_list = browser.find_elements(By.CLASS_NAME, "mw-allpages-chunk")
    if not page_list:
        raise Exception("ERROR: Could not get page list")

    pages = page_list[0].find_elements(By.TAG_NAME, "li")

    titles = []

    for page in pages:
        # BUG FIX: "class" is an HTML *attribute*, not a DOM property (the
        # property is named "className"), so get_property("class") always
        # returned None and redirect entries were never skipped.
        if page.get_attribute("class") == "allpagesredirect":
            continue
        # "title" IS a real DOM property on anchor elements, so
        # get_property is correct here.
        title_element = page.find_elements(By.TAG_NAME, "a")
        titles.append(title_element[0].get_property("title"))

    return titles
class PageSource:
    """Fetches and holds the wiki-markup source of a single MediaWiki page.

    Attributes:
        title: The wiki page title.
        contents: The raw markup of the page (from the edit textarea).
        image_urls: List of (alt_text, url) tuples for images on the page.
    """

    def __init__(self, title: str, browser):
        """Load the page's markup and image URLs using *browser*.

        Args:
            title: Wiki page title to fetch.
            browser: A selenium WebDriver instance.

        Raises:
            Exception: If the edit textbox or its contents cannot be read.
        """
        self.title = title

        # The raw markup is only reachable through the edit form's textarea.
        edit_link = f"https://wiki.mumble.info/index.php?title={title}&action=edit"
        browser.get(edit_link)

        source_textbox = browser.find_element(By.ID, "wpTextbox1")
        if not source_textbox:
            raise Exception(f"ERROR: could not find source textbox on page {edit_link}")

        contents = source_textbox.get_attribute("value")
        if not contents:
            raise Exception(f"ERROR: could not get page source for page {edit_link}")

        self.contents = contents

        self.image_urls = []

        # Visit the rendered page to discover the resolved image URLs.
        page_link = f"https://wiki.mumble.info/wiki/{title}"
        browser.get(page_link)

        images = browser.find_elements(By.CLASS_NAME, "image")
        for image in images:
            img_element = image.find_element(By.TAG_NAME, "img")
            img_url = img_element.get_attribute("src")
            # Renamed from "title" to stop shadowing the constructor argument.
            alt_text = img_element.get_attribute("alt")
            self.image_urls.append((alt_text, img_url))

    def as_html(self) -> str:
        """Return the page contents converted from creole markup to HTML."""
        return creole2html(self.contents)

    def as_md(self) -> str:
        """Convert the stored wiki markup to Markdown, line by line.

        Category and "br style" lines are dropped; ``[[File:...]]`` embeds
        become Markdown image links into the local images/ directory;
        headings and links get rewritten to Markdown syntax.

        Returns:
            The converted Markdown text.

        Raises:
            Exception: If a File: line has no parseable filename.
        """
        md = ""

        # [url text] external links and [[Page|label]] internal links.
        find_url = re.compile(r"(?<!\[)\[([^\[]+) (\S+)]")
        find_page_link = re.compile(r"\[\[(?!File:)([^\|]+)\|([^\|]+)]]")
        # Compiled once (was re-built per line via re.search on a raw string).
        find_file = re.compile(r"\[\[File:([^|]+)(\|[^\|\]]+)*]]")

        # Hoisted out of the loop: these closures don't depend on the line.
        def convert_url(match_obj):
            return f" [{match_obj.group(2)}]({match_obj.group(1)})"

        def convert_page_link(match_obj):
            return f"[{match_obj.group(2)}]({match_obj.group(1)}.md)"

        for line in self.contents.split("\n"):
            line = find_url.sub(convert_url, line)
            line = find_page_link.sub(convert_page_link, line)

            if "Category:" in line:
                continue
            elif "File:" in line:
                filename = find_file.search(line)
                if filename:
                    filename = filename.group(1).replace(" ", "_").lower()
                else:
                    raise Exception("No filename")
                # BUG FIX: the computed filename was never emitted (the
                # f-string was empty).  Write a Markdown image reference
                # matching where download_files() saves the image.
                md += f"![{filename}](images/{filename})"
            elif "{{" in line and "}}" in line:
                md += line.replace("{{", "# ").replace("}}", "")
            elif "===" in line:
                md += line.replace("===", "###", 1).replace("===", "")
            elif "==" in line:
                md += line.replace("==", "##", 1).replace("==", "")
            elif "br style" in line:
                continue
            else:
                md += line
            md += "\n"

        return md
def convert_markup_to_html(sources: "list[PageSource]"):
    """Write raw (.txt), HTML (.html), and Markdown (.md) copies of each page.

    Files go under output/, mirroring any "/" in the page title as
    subdirectories.

    Args:
        sources: Page sources to render; three files are written per page.
    """
    os.makedirs("output", exist_ok=True)
    for source in sources:
        out_base = f"output/{source.title}"
        # Titles may contain "/" (wiki subpages), so create intermediate
        # directories.  (Replaces the old phantom ".test" path that existed
        # only to feed dirname.)
        os.makedirs(os.path.dirname(out_base), exist_ok=True)
        # Wiki content can contain non-ASCII text; pin the encoding so the
        # output doesn't depend on the platform default.
        with open(f"{out_base}.txt", "w", encoding="utf-8") as f:
            f.write(source.contents)
        with open(f"{out_base}.html", "w", encoding="utf-8") as f:
            f.write(source.as_html())
        with open(f"{out_base}.md", "w", encoding="utf-8") as f:
            f.write(source.as_md())
def download_files(sources: "list[PageSource]"):
    """Download every image referenced by the given pages into output/images/.

    Args:
        sources: Pages whose image_urls (alt_text, url) pairs will be fetched.
    """
    os.makedirs("output/images", exist_ok=True)
    for source in sources:
        for alt_text, url in source.image_urls:
            # Normalize the alt text into a stable on-disk filename —
            # the same normalization as_md() uses for its image links.
            filename = alt_text.replace(" ", "_").lower()
            if not filename:
                continue
            if "url=" in filename:
                # Some alt texts are leftover URL fragments, not real names.
                continue
            # Request only after the skip checks, to avoid wasted fetches.
            r = requests.get(url, stream=True)
            if r.status_code == 200:
                # BUG FIX: the destination was a literal placeholder path
                # ("output/images/(unknown)"), so every image overwrote the
                # same file; save under the normalized filename instead.
                with open(f"output/images/{filename}", "wb") as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
def main():
    """Scrape the Mumble wiki into output/ (txt/html/md plus images)."""
    browser = webdriver.Firefox()
    try:
        links = get_titles(browser)

        # Fetch each page's markup and image list up front, then render.
        sources = [PageSource(link, browser) for link in links]
        convert_markup_to_html(sources)

        download_files(sources)
    finally:
        # BUG FIX: the browser was never shut down, leaking a
        # Firefox/geckodriver process on every run (and on any error).
        browser.quit()


if __name__ == "__main__":
    main()
# Reference in New Issue
# Block a user