# scrubber/3_archive.py

import json
from http.client import HTTPResponse
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import brotli
import msgpack
import psycopg

from com import Visibility, eval_config, parse_graph, progressbar

config = eval_config()
conn: psycopg.Connection = config["connect"]()
graph = parse_graph()
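
# note ids selected for archiving, one per line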
print("reading filterlist")
filtered = Path("filtered.list").read_text().strip().splitlines()
collected_users = {}
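
# look up a user's profile once and cache it in collected_users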
def collect_user(id: str):
    if id in collected_users:
        return
    user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
    if user is None:
        return None
    username, host, avatar_url = user
    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
    description, fields = profile or ("", [])
    output = {}
    output["id"] = id
    output["username"] = username
    output["host"] = host
    output["description"] = description
    output["fields"] = fields
    output["avatar_url"] = avatar_url
    collected_users[id] = output
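
# accumulators filled while walking the note graph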
collected_notes = []
files_to_collect = []
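
# serialize one note, pulling in its author, attachments, and reply/quote subtree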
def collect_note(id: str):
    output = {}
    output["id"] = id
    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
    if note is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
    collect_user(user_id)
    output["text"] = text
    output["user_id"] = user_id
    output["created_at"] = created_at.astimezone(tz=None).isoformat()
    output["updated_at"] = None
    if updated_at is not None:
        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
    output["reactions"] = reactions
    output["renotes"] = renotes
    output["visibility"] = Visibility.from_db(visibility).code()
    node = graph[id]
    replies = [collect_note(reply) for reply in node["replies"]]
    replies = filter(lambda reply: reply is not None, replies)
    quotes = [collect_note(quote) for quote in node["quotes"]]
    quotes = filter(lambda quote: quote is not None, quotes)
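    # keep attachment metadata; own files are downloaded later, so their
    # archived url is cleared, while remote files keep their original url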
output["attachments"] = []
for file_id in file_ids:
name, type_, comment, url = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
attachment = {
"id": file_id,
"type": type_,
"comment": comment,
}
if "self" in node["flags"]: # archive own attachments
files_to_collect.append((file_id, url))
attachment["url"] = None
else:
attachment["url"] = url
output["replies"] = list(replies)
output["quotes"] = list(quotes)
if len(output["attachments"]) == 0: del output["attachments"]
if len(output["replies"]) == 0: del output["replies"]
if len(output["quotes"]) == 0: del output["quotes"]
return output
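
# walk every filtered note; the helpers fill collected_users and files_to_collect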
pb = progressbar.ProgressBar(
    0,
    len(filtered),
    prefix="collecting data ",
)
for id in filtered:
    note = collect_note(id)
    collected_notes.append((id, note))
    pb.increment()
pb.finish()
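
# lay out the output directory tree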
outdir = Path("out")
if not outdir.exists():
    outdir.mkdir()
if not (outdir / "note").exists():
    (outdir / "note").mkdir()
if not (outdir / "user").exists():
    (outdir / "user").mkdir()
if not (outdir / "file").exists():
    (outdir / "file").mkdir()
pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)
for id, note in collected_notes:
    outfile = outdir / "note" / f"{id}.mpk.br"
    with outfile.open("wb") as f:
        f.write(brotli.compress(msgpack.dumps(note)))
    pb.increment()
for id, user in collected_users.items():
    outfile = outdir / "user" / f"{id}.mpk.br"
    with outfile.open("wb") as f:
        f.write(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()
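
# download attachments flagged "self" so the archive is self-contained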
pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for (id, url) in files_to_collect:
    outfile = outdir / "file" / id
    response: HTTPResponse = urlopen(url)
    with outfile.open("wb") as f:
        copyfileobj(response, f)
    response.close()
    pb.increment()
pb.finish()