scrubber/3_archive.py

import json
import time
from http.client import HTTPResponse
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import brotli
import msgpack
import psycopg

from com import Visibility, eval_config, parse_graph, progressbar

config = eval_config()
conn: psycopg.Connection = config["connect"]()
graph = parse_graph()
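
# filtered.list holds one entry per line; only the first space-separated field
# (the note id) is used here.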
print("reading filterlist")
filtered = Path("filtered.list").read_text().strip().splitlines()
filtered = list(map(lambda line: line.split(' ')[0], filtered))
collected_users = {}
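
# Look up a user row and profile once per id and cache a trimmed record in collected_users.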
def collect_user(id: str):
    if id in collected_users:
        return
    time.sleep(0.001)
    user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
    if user is None:
        return None
    username, host, avatar_url = user
    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
    description, fields = profile or ("", [])
    output = {}
    output["id"] = id
    output["username"] = username
    output["host"] = host
    output["description"] = description
    output["fields"] = fields
    output["avatar_url"] = avatar_url
    collected_users[id] = output

collected_notes = []
files_to_collect = []
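
# Recursively collect a note, its replies and quotes (from the parsed graph), and
# its attachment metadata; attachments on notes flagged "self" are queued for local download.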
def collect_note(id: str):
    output = {}
    output["id"] = id
    time.sleep(0.001)
    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds", cw from note where id = %s', [id]).fetchone()
    if note is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids, cw = note
    collect_user(user_id)
    output["text"] = text
    output["user_id"] = user_id
    output["created_at"] = created_at.astimezone(tz=None).isoformat()
    output["updated_at"] = None
    if updated_at is not None:
        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
    output["reactions"] = reactions
    output["renotes"] = renotes
    output["visibility"] = Visibility.from_db(visibility).code()
    output["cw"] = cw
    node = graph[id]
    replies = [collect_note(reply) for reply in node["replies"]]
    replies = filter(lambda reply: reply is not None, replies)
    quotes = [collect_note(quote) for quote in node["quotes"]]
    quotes = filter(lambda quote: quote is not None, quotes)
    output["attachments"] = []
    for file_id in file_ids:
        time.sleep(0.0005)
        row = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
        if row is None:
            continue
        name, type_, comment, url = row
        attachment = {
            "id": file_id,
            "type": type_,
            "comment": comment,
        }
        if "self" in node["flags"]:  # archive own attachments
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        output["attachments"].append(attachment)
    output["replies"] = list(replies)
    output["quotes"] = list(quotes)
    if len(output["attachments"]) == 0: del output["attachments"]
    if len(output["replies"]) == 0: del output["replies"]
    if len(output["quotes"]) == 0: del output["quotes"]
    return output
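
# Each id in the filter list is collected depth-first, pulling in its whole
# reply/quote subtree and queueing any "self" attachments for download.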
pb = progressbar.ProgressBar(
    0,
    len(filtered),
    prefix="collecting data ",
)
for id in filtered:
    note = collect_note(id)
    collected_notes.append((id, note))
    pb.increment()
pb.finish()
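
# Records are written as brotli-compressed msgpack, sharded into subdirectories by
# id prefix: out/note/<id[:3]>/, out/user/<id[:2]>/, and out/file/<id[:2]>/ for
# downloaded attachments.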
outdir = Path("out")
if not outdir.exists():
    outdir.mkdir()
if not (outdir / "note").exists():
    (outdir / "note").mkdir()
if not (outdir / "user").exists():
    (outdir / "user").mkdir()
if not (outdir / "file").exists():
    (outdir / "file").mkdir()

pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)
for id, note in collected_notes:
    outfile = outdir / "note" / id[:3] / f"{id[3:]}.mpk.br"
    outfile.parent.mkdir(exist_ok=True)
    with outfile.open("wb") as f:
        f.write(brotli.compress(msgpack.dumps(note)))
    pb.increment()
for id, user in collected_users.items():
    outfile = outdir / "user" / id[:2] / f"{id[2:]}.mpk.br"
    outfile.parent.mkdir(exist_ok=True)
    with outfile.open("wb") as f:
        f.write(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()

pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for (id, url) in files_to_collect:
    outfile = outdir / "file" / id[:2] / id[2:]
    outfile.parent.mkdir(exist_ok=True)
    response: HTTPResponse = urlopen(url)
    with outfile.open("wb") as f:
        copyfileobj(response, f)
    response.close()
    pb.increment()
pb.finish()
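
# A minimal sketch (not executed here) of reading an archived note back, assuming
# the layout written above and a hypothetical `note_id`:
#
#     raw = (outdir / "note" / note_id[:3] / f"{note_id[3:]}.mpk.br").read_bytes()
#     note = msgpack.loads(brotli.decompress(raw))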