Files
mumble-wiki-crawler/main.py
T
2023-03-10 19:20:18 -07:00

153 lines
4.7 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import re
import requests
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from creole import creole2html
from markdownify import markdownify
def get_titles(browser) -> list[str]:
    """Return the titles of all non-redirect pages on the Mumble wiki.

    Navigates *browser* (a Selenium WebDriver) to Special:AllPages and
    collects the ``title`` of the first link in every list item.

    Raises:
        Exception: if the page-list container cannot be found.
    """
    browser.get("https://wiki.mumble.info/wiki/Special:AllPages")
    chunks = browser.find_elements(By.CLASS_NAME, "mw-allpages-chunk")
    if not chunks:
        raise Exception("ERROR: Could not get page list")
    titles = []
    for item in chunks[0].find_elements(By.TAG_NAME, "li"):
        # BUG FIX: "class" is an HTML *attribute*; the DOM *property* is
        # "className", so get_property("class") returned None and redirect
        # entries were never skipped.  Also test membership in the class
        # list instead of exact string equality, which breaks as soon as
        # the element carries more than one class.
        css_classes = item.get_attribute("class") or ""
        if "allpagesredirect" in css_classes.split():
            continue
        anchors = item.find_elements(By.TAG_NAME, "a")
        titles.append(anchors[0].get_property("title"))
    return titles
class PageSource:
    """The wiki-markup source and image links of a single wiki page.

    Attributes:
        title: the page title as passed to the constructor.
        contents: the raw wiki markup fetched from the edit textbox.
        image_urls: list of ``(alt_text, src_url)`` tuples for every image
            on the rendered page.
    """

    # Creole-style external link: "[https://example.com Label]".
    # Compiled once at class level — loop-invariant.
    _FIND_URL = re.compile(r"(?<!\[)\[([^\[]+) (\S+)]")
    # Internal page link: "[[Page Title|Label]]" (File: links excluded).
    _FIND_PAGE_LINK = re.compile(r"\[\[(?!File:)([^\|]+)\|([^\|]+)]]")
    # Image embed: "[[File:Name.png|thumb|...]]".
    _FIND_FILE = re.compile(r"\[\[File:([^|]+)(\|[^\|\]]+)*]]")

    def __init__(self, title: str, browser):
        self.title = title
        edit_link = f"https://wiki.mumble.info/index.php?title={title}&action=edit"
        browser.get(edit_link)
        source_textbox = browser.find_element(By.ID, "wpTextbox1")
        if not source_textbox:
            raise Exception(f"ERROR: could not find source textbox on page {edit_link}")
        contents = source_textbox.get_attribute("value")
        if not contents:
            raise Exception(f"ERROR: could not get page source for page {edit_link}")
        self.contents = contents
        self.image_urls = []
        page_link = f"https://wiki.mumble.info/wiki/{title}"
        browser.get(page_link)
        for image in browser.find_elements(By.CLASS_NAME, "image"):
            img_element = image.find_element(By.TAG_NAME, "img")
            img_url = img_element.get_attribute("src")
            # BUG FIX: the original reused the name `title` here, shadowing
            # the constructor parameter; use a distinct name for the alt text.
            alt_text = img_element.get_attribute("alt")
            self.image_urls.append((alt_text, img_url))

    def as_html(self) -> str:
        """Render the wiki markup to HTML via python-creole."""
        return creole2html(self.contents)

    @staticmethod
    def _convert_url(match_obj) -> str:
        """re.sub callback: creole external link -> Markdown link."""
        return f" [{match_obj.group(2)}]({match_obj.group(1)})"

    @staticmethod
    def _convert_page_link(match_obj) -> str:
        """re.sub callback: internal wiki link -> link to a local .md file."""
        return f"[{match_obj.group(2)}]({match_obj.group(1)}.md)"

    def as_md(self) -> str:
        """Convert the wiki markup to Markdown, line by line.

        Category lines and "br style" lines are dropped; File: embeds become
        image references into the local images/ directory; `{{...}}` becomes
        a top-level heading; `===`/`==` become `###`/`##`.
        """
        md = ""
        for line in self.contents.split("\n"):
            line = self._FIND_URL.sub(self._convert_url, line)
            line = self._FIND_PAGE_LINK.sub(self._convert_page_link, line)
            if "Category:" in line:
                continue
            elif "File:" in line:
                filename = self._FIND_FILE.search(line)
                if filename:
                    filename = filename.group(1).replace(" ", "_").lower()
                else:
                    raise Exception("No filename")
                # BUG FIX: the computed `filename` was never interpolated
                # (a literal placeholder was emitted instead).  Point at the
                # file that download_files() stores under output/images/.
                md += f"![Image](images/{filename})"
            elif "{{" in line and "}}" in line:
                md += line.replace("{{", "# ").replace("}}", "")
            elif "===" in line:
                # Only the first marker becomes the heading prefix; the
                # trailing marker is stripped.
                md += line.replace("===", "###", 1).replace("===", "")
            elif "==" in line:
                md += line.replace("==", "##", 1).replace("==", "")
            elif "br style" in line:
                continue
            else:
                md += line
            md += "\n"
        return md
def convert_markup_to_html(sources: list[PageSource]):
    """Write each page's raw markup (.txt), HTML (.html) and Markdown (.md)
    under output/, mirroring any "/" sub-page structure in the title.

    Args:
        sources: fetched PageSource objects to serialize.
    """
    os.makedirs("output", exist_ok=True)
    for source in sources:
        base = f"output/{source.title}"
        # Titles may contain "/" (wiki sub-pages); create intermediate dirs.
        # The original built this path with a misleading ".test" suffix
        # used only for its dirname.
        os.makedirs(os.path.dirname(base), exist_ok=True)
        # Explicit UTF-8 so output doesn't depend on the platform locale.
        with open(f"{base}.txt", "w", encoding="utf-8") as f:
            f.write(source.contents)
        with open(f"{base}.html", "w", encoding="utf-8") as f:
            f.write(source.as_html())
        with open(f"{base}.md", "w", encoding="utf-8") as f:
            f.write(source.as_md())
def download_files(sources: list[PageSource]):
    """Download every image referenced by *sources* into output/images/.

    The destination filename is derived from the image's alt text
    (spaces -> underscores, lowercased), matching the names emitted by
    PageSource.as_md(). Entries with an empty or URL-like alt text are
    skipped; non-200 responses are silently ignored (best effort).
    """
    os.makedirs("output/images", exist_ok=True)
    for source in sources:
        for alt_text, url in source.image_urls:
            filename = alt_text.replace(" ", "_").lower()
            # Do the cheap filters BEFORE issuing the HTTP request
            # (the original fetched first and then discarded the response).
            if not filename or "url=" in filename:
                continue
            # `with` closes the connection even when the body isn't consumed.
            with requests.get(url, stream=True) as r:
                if r.status_code == 200:
                    # BUG FIX: the original wrote every image to a single
                    # literal placeholder path; interpolate the filename.
                    with open(f"output/images/{filename}", "wb") as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
def main():
    """Crawl the Mumble wiki: fetch all page sources, convert them to
    txt/html/md under output/, and download referenced images."""
    browser = webdriver.Firefox()
    try:
        titles = get_titles(browser)
        sources = [PageSource(title, browser) for title in titles]
    finally:
        # BUG FIX: the browser was never quit, leaking the Firefox/geckodriver
        # processes — especially on any exception during crawling.
        browser.quit()
    # Conversion and downloads don't need the browser anymore.
    convert_markup_to_html(sources)
    download_files(sources)


if __name__ == "__main__":
    main()