153 lines
4.7 KiB
Python
Executable File
153 lines
4.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import os
import re
import shutil
from urllib.parse import quote

import requests
from creole import creole2html
from markdownify import markdownify
from selenium import webdriver
from selenium.webdriver.common.by import By
|
|
|
|
|
|
def get_titles(browser) -> list[str]:
    """Collect the titles of the non-redirect pages listed on Special:AllPages.

    Args:
        browser: Selenium WebDriver used to load and query the page.

    Returns:
        The ``title`` property of the first link of every list entry that is
        not marked with the ``allpagesredirect`` class, in page order. Only
        the first ``mw-allpages-chunk`` element is read.

    Raises:
        Exception: If the page has no ``mw-allpages-chunk`` element.
    """
    browser.get("https://wiki.mumble.info/wiki/Special:AllPages")

    page_list = browser.find_elements(By.CLASS_NAME, "mw-allpages-chunk")
    if not page_list:
        raise Exception("ERROR: Could not get page list")

    titles = []
    for page in page_list[0].find_elements(By.TAG_NAME, "li"):
        # "class" is an HTML attribute; the corresponding DOM property is
        # "className", so the attribute has to be read to see the marker.
        css_classes = (page.get_attribute("class") or "").split()
        if "allpagesredirect" in css_classes:
            continue

        links = page.find_elements(By.TAG_NAME, "a")
        if not links:
            # An entry without a link has no title to export.
            continue
        titles.append(links[0].get_property("title"))

    return titles
|
|
|
|
|
|
class PageSource:
    """Raw wiki source and image references of one wiki.mumble.info page.

    Instance attributes set by ``__init__``:
        title: The page title as passed in.
        contents: Value of the ``wpTextbox1`` element on the page's edit view.
        image_urls: ``(alt, src)`` tuples, one per ``img`` found inside an
            element with class ``image`` on the rendered page.
    """

    # "[url label]"; the target must not start with "[" so that "[[...]]"
    # wiki links are left for the page-link / file handling.
    _EXTERNAL_LINK = re.compile(r"(?<!\[)\[([^\[\]\s]+) +([^\[\]]+)\]")
    # "[[Target|label]]", excluding "[[File:...]]". Brackets are excluded
    # from both groups so one match cannot swallow a neighbouring link.
    _PAGE_LINK = re.compile(r"\[\[(?!File:)([^\[\]|]+)\|([^\[\]|]+)\]\]")
    # "[[File:name|option|...]]"
    _FILE_LINK = re.compile(r"\[\[File:([^|\]]+)(?:\|[^|\]]+)*\]\]")
    # "== Title ==" up to "====== Title ======" occupying the whole line.
    _HEADING = re.compile(r"^\s*(={2,6})\s*(.+?)\s*\1\s*$")

    def __init__(self, title: str, browser):
        """Load the source text and image list of ``title`` via ``browser``.

        Args:
            title: Wiki page title.
            browser: Selenium WebDriver used to open the edit view and the
                rendered page.

        Raises:
            Exception: If the edit view has no ``wpTextbox1`` element or its
                value is empty.
        """
        self.title = title

        # Escape the title so characters such as "&", "+" or "?" cannot
        # change the meaning of the URL.
        query_title = quote(title, safe="")
        edit_link = f"https://wiki.mumble.info/index.php?title={query_title}&action=edit"
        browser.get(edit_link)

        # find_elements returns an empty list when nothing matches, which
        # lets the descriptive error below actually fire.
        textboxes = browser.find_elements(By.ID, "wpTextbox1")
        if not textboxes:
            raise Exception(f"ERROR: could not find source textbox on page {edit_link}")

        contents = textboxes[0].get_attribute("value")
        if not contents:
            raise Exception(f"ERROR: could not get page source for page {edit_link}")

        self.contents = contents

        # (alt text, image URL) pairs; either value may be None when the
        # attribute is missing on the element.
        self.image_urls = []

        path_title = quote(title, safe="/:")
        page_link = f"https://wiki.mumble.info/wiki/{path_title}"
        browser.get(page_link)

        for image in browser.find_elements(By.CLASS_NAME, "image"):
            img_elements = image.find_elements(By.TAG_NAME, "img")
            if not img_elements:
                continue
            img_element = img_elements[0]
            alt_text = img_element.get_attribute("alt")
            img_url = img_element.get_attribute("src")
            self.image_urls.append((alt_text, img_url))

    def as_html(self) -> str:
        """Return ``self.contents`` converted by ``creole2html``."""
        return creole2html(self.contents)

    @staticmethod
    def _external_link_to_md(match_obj) -> str:
        """Turn a ``[url label]`` match into `` [label](url)``."""
        return f" [{match_obj.group(2).strip()}]({match_obj.group(1)})"

    @staticmethod
    def _page_link_to_md(match_obj) -> str:
        """Turn a ``[[Target|label]]`` match into ``[label](Target.md)``."""
        return f"[{match_obj.group(2)}]({match_obj.group(1)}.md)"

    def as_md(self) -> str:
        """Return ``self.contents`` converted line by line to Markdown.

        Per line, after rewriting external and piped page links:
          * lines containing ``Category:`` are dropped;
          * lines containing ``File:`` become an image reference to
            ``images/<name>``, with the name lower-cased and spaces replaced
            by underscores;
          * lines containing both ``{{`` and ``}}`` have ``{{`` replaced by
            ``# `` and ``}}`` removed;
          * ``==``-delimited heading lines become ``#`` headings of the same
            level;
          * lines containing ``br style`` are dropped;
          * everything else is copied unchanged.

        Every emitted line is terminated with a newline.

        Raises:
            Exception: If a line contains ``File:`` but no ``[[File:...]]``
                link can be parsed from it.
        """
        out = []
        for line in self.contents.split("\n"):
            line = self._EXTERNAL_LINK.sub(self._external_link_to_md, line)
            line = self._PAGE_LINK.sub(self._page_link_to_md, line)

            if "Category:" in line:
                continue

            if "File:" in line:
                file_link = self._FILE_LINK.search(line)
                if not file_link:
                    raise Exception("No filename")
                filename = file_link.group(1).replace(" ", "_").lower()
                # Images are stored under "images/" using this same
                # normalisation of the name.
                out.append(f"![{filename}](images/{filename})")
            elif "{{" in line and "}}" in line:
                out.append(line.replace("{{", "# ").replace("}}", ""))
            elif heading := self._HEADING.match(line):
                level = len(heading.group(1))
                out.append(f"{'#' * level} {heading.group(2)}")
            elif "br style" in line:
                continue
            else:
                out.append(line)

        return "".join(f"{converted}\n" for converted in out)
|
|
|
|
|
|
def convert_markup_to_html(sources: list["PageSource"]):
    """Write each page as ``.txt``, ``.html`` and ``.md`` under ``output/``.

    For every source, ``output/<title>.txt`` receives ``source.contents``,
    ``output/<title>.html`` receives ``source.as_html()`` and
    ``output/<title>.md`` receives ``source.as_md()``. Directories implied by
    a ``/`` in the title are created as needed.

    Args:
        sources: Objects providing ``title``, ``contents``, ``as_html()`` and
            ``as_md()``.
    """
    os.makedirs("output", exist_ok=True)
    for source in sources:
        base = f"output/{source.title}"
        os.makedirs(os.path.dirname(base), exist_ok=True)

        # Callables keep each conversion lazy, so it runs only once its
        # output file has been opened (txt first, then html, then md).
        renderers = (
            (".txt", lambda: source.contents),
            (".html", source.as_html),
            (".md", source.as_md),
        )
        for suffix, render in renderers:
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-ASCII wiki text.
            with open(f"{base}{suffix}", "w", encoding="utf-8") as f:
                f.write(render())
|
|
|
|
|
|
def download_files(sources: list["PageSource"]):
    """Download the images referenced by ``sources`` into ``output/images``.

    Each ``(alt, url)`` entry of ``source.image_urls`` is saved under the alt
    text, lower-cased and with spaces replaced by underscores. Entries with an
    empty alt text, an alt text containing ``url=``, or no URL are skipped, as
    are responses whose status code is not 200.

    Args:
        sources: Objects providing an ``image_urls`` iterable of
            ``(alt, url)`` pairs.
    """
    os.makedirs("output/images", exist_ok=True)
    for source in sources:
        for alt_text, url in source.image_urls:
            # Decide whether the entry is wanted before touching the network.
            filename = (alt_text or "").replace(" ", "_").lower()
            if not filename or "url=" in filename or not url:
                continue

            # The name comes from remote page content; keep it inside
            # output/images by neutralising path separators.
            safe_name = filename.replace("/", "_").replace("\\", "_")

            # The context manager releases the streamed connection; the
            # timeout keeps a stalled server from blocking the run forever.
            with requests.get(url, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    continue
                # Have urllib3 undo any transfer compression while copying.
                response.raw.decode_content = True
                with open(f"output/images/{safe_name}", "wb") as f:
                    shutil.copyfileobj(response.raw, f)
|
|
|
|
|
|
def main():
    """Scrape every listed wiki page, then write the converted files and images."""
    browser = webdriver.Firefox()
    try:
        links = get_titles(browser)
        sources = [PageSource(link, browser) for link in links]
    finally:
        # The browser is only needed while scraping; always shut it down so
        # no Firefox process is left behind, even when scraping fails.
        browser.quit()

    convert_markup_to_html(sources)
    download_files(sources)
|
|
|
|
|
|
# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    main()
|