Files
mumble-wiki-crawler/main.py
T
2023-03-10 19:20:18 -07:00

153 lines
4.7 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import re
import requests
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from creole import creole2html
from markdownify import markdownify
def get_titles(browser) -> list[str]:
    """Return the titles of all non-redirect pages on the Mumble wiki.

    Navigates *browser* (a Selenium WebDriver) to Special:AllPages and
    collects the ``title`` of the first link in every list item.

    Raises:
        Exception: if the page-list container cannot be found.
    """
    browser.get("https://wiki.mumble.info/wiki/Special:AllPages")
    chunks = browser.find_elements(By.CLASS_NAME, "mw-allpages-chunk")
    if not chunks:
        raise Exception("ERROR: Could not get page list")
    titles = []
    for item in chunks[0].find_elements(By.TAG_NAME, "li"):
        # BUG FIX: "class" is an HTML *attribute*; the DOM *property* is
        # "className", so get_property("class") returned None and redirect
        # entries were never skipped.  Also test membership in the class
        # list instead of exact string equality, which breaks as soon as
        # the element carries more than one class.
        css_classes = item.get_attribute("class") or ""
        if "allpagesredirect" in css_classes.split():
            continue
        anchors = item.find_elements(By.TAG_NAME, "a")
        titles.append(anchors[0].get_property("title"))
    return titles
class PageSource:
    """The wiki-markup source and image links of a single wiki page.

    Attributes:
        title: the page title as passed to the constructor.
        contents: the raw wiki markup fetched from the edit textbox.
        image_urls: list of ``(alt_text, src_url)`` tuples for every image
            on the rendered page.
    """

    # Creole-style external link: "[https://example.com Label]".
    # Compiled once at class level — loop-invariant.
    _FIND_URL = re.compile(r"(?<!\[)\[([^\[]+) (\S+)]")
    # Internal page link: "[[Page Title|Label]]" (File: links excluded).
    _FIND_PAGE_LINK = re.compile(r"\[\[(?!File:)([^\|]+)\|([^\|]+)]]")
    # Image embed: "[[File:Name.png|thumb|...]]".
    _FIND_FILE = re.compile(r"\[\[File:([^|]+)(\|[^\|\]]+)*]]")

    def __init__(self, title: str, browser):
        self.title = title
        edit_link = f"https://wiki.mumble.info/index.php?title={title}&action=edit"
        browser.get(edit_link)
        source_textbox = browser.find_element(By.ID, "wpTextbox1")
        if not source_textbox:
            raise Exception(f"ERROR: could not find source textbox on page {edit_link}")
        contents = source_textbox.get_attribute("value")
        if not contents:
            raise Exception(f"ERROR: could not get page source for page {edit_link}")
        self.contents = contents
        self.image_urls = []
        page_link = f"https://wiki.mumble.info/wiki/{title}"
        browser.get(page_link)
        for image in browser.find_elements(By.CLASS_NAME, "image"):
            img_element = image.find_element(By.TAG_NAME, "img")
            img_url = img_element.get_attribute("src")
            # BUG FIX: the original reused the name `title` here, shadowing
            # the constructor parameter; use a distinct name for the alt text.
            alt_text = img_element.get_attribute("alt")
            self.image_urls.append((alt_text, img_url))

    def as_html(self) -> str:
        """Render the wiki markup to HTML via python-creole."""
        return creole2html(self.contents)

    @staticmethod
    def _convert_url(match_obj) -> str:
        """re.sub callback: creole external link -> Markdown link."""
        return f" [{match_obj.group(2)}]({match_obj.group(1)})"

    @staticmethod
    def _convert_page_link(match_obj) -> str:
        """re.sub callback: internal wiki link -> link to a local .md file."""
        return f"[{match_obj.group(2)}]({match_obj.group(1)}.md)"

    def as_md(self) -> str:
        """Convert the wiki markup to Markdown, line by line.

        Category lines and "br style" lines are dropped; File: embeds become
        image references into the local images/ directory; `{{...}}` becomes
        a top-level heading; `===`/`==` become `###`/`##`.
        """
        md = ""
        for line in self.contents.split("\n"):
            line = self._FIND_URL.sub(self._convert_url, line)
            line = self._FIND_PAGE_LINK.sub(self._convert_page_link, line)
            if "Category:" in line:
                continue
            elif "File:" in line:
                filename = self._FIND_FILE.search(line)
                if filename:
                    filename = filename.group(1).replace(" ", "_").lower()
                else:
                    raise Exception("No filename")
                # BUG FIX: the computed `filename` was never interpolated
                # (a literal placeholder was emitted instead).  Point at the
                # file that download_files() stores under output/images/.
                md += f"![Image](images/{filename})"
            elif "{{" in line and "}}" in line:
                md += line.replace("{{", "# ").replace("}}", "")
            elif "===" in line:
                # Only the first marker becomes the heading prefix; the
                # trailing marker is stripped.
                md += line.replace("===", "###", 1).replace("===", "")
            elif "==" in line:
                md += line.replace("==", "##", 1).replace("==", "")
            elif "br style" in line:
                continue
            else:
                md += line
            md += "\n"
        return md
def convert_markup_to_html(sources: list[PageSource]):
    """Write each page's raw markup (.txt), HTML (.html) and Markdown (.md)
    under output/, mirroring any "/" sub-page structure in the title.

    Args:
        sources: fetched PageSource objects to serialize.
    """
    os.makedirs("output", exist_ok=True)
    for source in sources:
        base = f"output/{source.title}"
        # Titles may contain "/" (wiki sub-pages); create intermediate dirs.
        # The original built this path with a misleading ".test" suffix
        # used only for its dirname.
        os.makedirs(os.path.dirname(base), exist_ok=True)
        # Explicit UTF-8 so output doesn't depend on the platform locale.
        with open(f"{base}.txt", "w", encoding="utf-8") as f:
            f.write(source.contents)
        with open(f"{base}.html", "w", encoding="utf-8") as f:
            f.write(source.as_html())
        with open(f"{base}.md", "w", encoding="utf-8") as f:
            f.write(source.as_md())
def download_files(sources: list[PageSource]):
    """Download every image referenced by *sources* into output/images/.

    The destination filename is derived from the image's alt text
    (spaces -> underscores, lowercased), matching the names emitted by
    PageSource.as_md(). Entries with an empty or URL-like alt text are
    skipped; non-200 responses are silently ignored (best effort).
    """
    os.makedirs("output/images", exist_ok=True)
    for source in sources:
        for alt_text, url in source.image_urls:
            filename = alt_text.replace(" ", "_").lower()
            # Do the cheap filters BEFORE issuing the HTTP request
            # (the original fetched first and then discarded the response).
            if not filename or "url=" in filename:
                continue
            # `with` closes the connection even when the body isn't consumed.
            with requests.get(url, stream=True) as r:
                if r.status_code == 200:
                    # BUG FIX: the original wrote every image to a single
                    # literal placeholder path; interpolate the filename.
                    with open(f"output/images/{filename}", "wb") as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
def main():
    """Crawl the Mumble wiki: fetch all page sources, convert them to
    txt/html/md under output/, and download referenced images."""
    browser = webdriver.Firefox()
    try:
        titles = get_titles(browser)
        sources = [PageSource(title, browser) for title in titles]
    finally:
        # BUG FIX: the browser was never quit, leaking the Firefox/geckodriver
        # processes — especially on any exception during crawling.
        browser.quit()
    # Conversion and downloads don't need the browser anymore.
    convert_markup_to_html(sources)
    download_files(sources)


if __name__ == "__main__":
    main()