diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2caa084
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/__pycache__/
+/graph.db
+/filtered.list
+/out/
+/sec.py
diff --git a/1_graph.py b/1_graph.py
index bc8116c..824d723 100644
--- a/1_graph.py
+++ b/1_graph.py
@@ -3,24 +3,20 @@ import sys
 from collections import namedtuple
 from functools import cache
 from pathlib import Path
+from typing import Optional
 
 import psycopg
 
-try:
-    import progressbar2 as progressbar
-except ImportError:
-    import progressbar
+from com import eval_config, progressbar
 
 
 Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
 Tree = namedtuple("Tree", ["id", "replies", "renotes"])
 
-print("configuring")
-config = {}
-exec(Path("config.py").read_text(), config)
+config = eval_config()
 conn: psycopg.Connection = config["connect"]()
 user_id: str = config["user_id"]
-early_exit = config.get("early_exit")
+early_exit: Optional[int] = config.get("early_exit")
 
 print("fetching note ids", file=sys.stderr)
 
diff --git a/2_filter.py b/2_filter.py
index 816e762..8e77945 100644
--- a/2_filter.py
+++ b/2_filter.py
@@ -4,32 +4,14 @@ from typing import Callable, List
 
 import psycopg
 
-from ty import FilterableNote, Visibility
-
-try:
-    import progressbar2 as progressbar
-except ImportError:
-    import progressbar
+from com import FilterableNote, Visibility, eval_config, parse_graph, progressbar
 
 
-print("configuring")
-config = {}
-exec(Path("config.py").read_text(), config)
+config = eval_config()
 conn: psycopg.Connection = config["connect"]()
 criteria: Callable[[FilterableNote], bool] = config["criteria"]
 
-intermediate = {}
-
-print("parsing")
-for line in Path("graph.db").read_text().splitlines():
-    id, replies, quotes, flags = line.split("\t")
-    intermediate[id] = {
-        "id": id,
-        "replies": replies.split(",") if len(replies) > 0 else [],
-        "quotes": quotes.split(",") if len(quotes) > 0 else [],
-        "flags": flags.split(",") if len(flags) > 0 else [],
-    }
-
+intermediate = parse_graph()
 
 def transform(entry: dict) -> FilterableNote:
     note = conn.execute(
diff --git a/3_archive.py b/3_archive.py
new file mode 100644
index 0000000..6eef0e1
--- /dev/null
+++ b/3_archive.py
@@ -0,0 +1,151 @@
+import json
+from http.client import HTTPResponse
+from pathlib import Path
+from shutil import copyfileobj
+from urllib.request import urlopen
+
+import brotli
+import msgpack
+import psycopg
+
+from com import Visibility, eval_config, parse_graph, progressbar
+
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+
+graph = parse_graph()
+print("reading filterlist")
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+collected_users = {}
+def collect_user(id: str):
+    """Record the archivable profile of user `id` in `collected_users`, once per user."""
+    if id in collected_users:
+        return
+    user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
+    if user is None:
+        return
+    username, host, avatar_url = user
+    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
+    # a user without a profile row is archived with an empty description and no fields
+    description, fields = profile or ("", [])
+
+    collected_users[id] = {
+        "id": id,
+        "username": username,
+        "host": host,
+        "description": description,
+        "fields": fields,
+        "avatar_url": avatar_url,
+    }
+
+collected_notes = []
+files_to_collect = []
+def collect_note(id: str):
+    """Build the archive record for note `id`, recursing into its replies and quotes.
+
+    Returns None when the note has no row in the database. As side effects the
+    author is recorded through `collect_user`, and the attachments of notes
+    flagged "self" in the graph are queued in `files_to_collect` for download.
+    """
+    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
+    if note is None:
+        return None
+    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
+    collect_user(user_id)
+
+    output = {}
+    output["id"] = id
+    output["text"] = text
+    output["user_id"] = user_id
+    output["created_at"] = created_at.astimezone(tz=None).isoformat()
+    output["updated_at"] = None
+    if updated_at is not None:
+        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
+    output["reactions"] = reactions
+    output["renotes"] = renotes
+    output["visibility"] = Visibility.from_db(visibility).code()
+
+    node = graph[id]
+    replies = [collect_note(reply) for reply in node["replies"]]
+    replies = [reply for reply in replies if reply is not None]
+    quotes = [collect_note(quote) for quote in node["quotes"]]
+    quotes = [quote for quote in quotes if quote is not None]
+
+    attachments = []
+    for file_id in file_ids:
+        row = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
+        if row is None:
+            # the note points at a drive file without a row; there is nothing to archive
+            continue
+        _name, type_, comment, url = row
+        attachment = {
+            "id": file_id,
+            "type": type_,
+            "comment": comment,
+        }
+        if "self" in node["flags"]: # archive own attachments
+            files_to_collect.append((file_id, url))
+            attachment["url"] = None
+        else:
+            attachment["url"] = url
+        # keep the record; without this every note is archived without attachments
+        attachments.append(attachment)
+
+    # empty collections are left out of the record entirely
+    if attachments: output["attachments"] = attachments
+    if replies: output["replies"] = replies
+    if quotes: output["quotes"] = quotes
+
+    return output
+
+pb = progressbar.ProgressBar(
+    0,
+    len(filtered),
+    prefix="collecting data ",
+)
+for id in filtered:
+    note = collect_note(id)
+    # a listed note without a database row has nothing to archive
+    if note is not None:
+        collected_notes.append((id, note))
+    pb.increment()
+pb.finish()
+
+outdir = Path("out")
+for kind in ("note", "user", "file"):
+    (outdir / kind).mkdir(parents=True, exist_ok=True)
+
+pb = progressbar.ProgressBar(
+    0,
+    len(collected_notes) + len(collected_users),
+    prefix="writing data ",
+)
+
+for id, note in collected_notes:
+    outfile = outdir / "note" / f"{id}.mpk.br"
+    with outfile.open("wb") as f:
+        f.write(brotli.compress(msgpack.dumps(note)))
+    pb.increment()
+
+for id, user in collected_users.items():
+    outfile = outdir / "user" / f"{id}.mpk.br"
+    with outfile.open("wb") as f:
+        # each user file holds that user's own record, not a note
+        f.write(brotli.compress(msgpack.dumps(user)))
+    pb.increment()
+pb.finish()
+
+pb = progressbar.ProgressBar(
+    0,
+    len(files_to_collect),
+    prefix="downloading attachments ",
+)
+for (id, url) in files_to_collect:
+    outfile = outdir / "file" / id
+    # the with-block closes the response and the file even when the copy fails midway
+    response: HTTPResponse
+    with urlopen(url) as response, outfile.open("wb") as f:
+        copyfileobj(response, f)
+    pb.increment()
+pb.finish()
diff --git a/4_delete.py b/4_delete.py
new file mode 100644
index 0000000..51e1ef3
--- /dev/null
+++ b/4_delete.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import httpx
+import psycopg
+
+from com import eval_config, parse_graph, progressbar
+
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+token: str = config["token"]
+api: str = config["api"]
+
+graph = parse_graph()
+print("reading filterlist")
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+queue = []
+
+def enqueue(note):
+    for reply in note["replies"]:
+        enqueue(graph[reply])
+    for quote in note["quotes"]:
+        enqueue(graph[quote])
+    if "self" in note["flags"]:
+        files = conn.execute('select "fileIds" from note where id = %s', [note["id"]]).fetchone()[0]
+        queue.append((note["id"], files))
+
+for id in filtered:
+    enqueue(graph[id])
+
+print(queue)
+
+# client = httpx.Client()
diff --git a/ty.py b/com.py
similarity index 62%
rename from ty.py
rename to com.py
index e17c046..4ceb849 100644
--- a/ty.py
+++ b/com.py
@@ -1,7 +1,15 @@
+import sys
 from dataclasses import dataclass
-from typing import List, Callable
 from datetime import datetime
 from enum import Enum
+from pathlib import Path
+from typing import Callable, Dict, List
+
+try:
+    import progressbar2 as progressbar
+except ImportError:
+    import progressbar
+
 
 class Visibility(Enum):
     public = 1
@@ -17,6 +25,13 @@ class Visibility(Enum):
             case "followers": return cls.followers
             case "specified": return cls.direct
             case _: raise ValueError(f"unknown visibility `{raw}`")
+
+    def code(self) -> str:
+        match self:
+            case self.public: return "p"
+            case self.unlisted: return "u"
+            case self.followers: return "f"
+            case self.direct: return "d"
 
 
 @dataclass
@@ -59,3 +74,24 @@ class FilterableNote:
             "reactions": self.reactions,
             "renotes": self.renotes,
         }
+
+
+def eval_config() -> dict:
+    print("configuring")
+    config = {}
+    exec(Path(sys.argv[1]).read_text(), config)
+    return config
+
+
+def parse_graph() -> Dict[str, dict]:
+    print("parsing graph")
+    graph = {}
+    for line in Path("graph.db").read_text().splitlines():
+        id, replies, quotes, flags = line.split("\t")
+        graph[id] = {
+            "id": id,
+            "replies": replies.split(",") if len(replies) > 0 else [],
+            "quotes": quotes.split(",") if len(quotes) > 0 else [],
+            "flags": flags.split(",") if len(flags) > 0 else [],
+        }
+    return graph
diff --git a/conf_mia.py b/conf_mia.py
new file mode 100644
index 0000000..6496e3b
--- /dev/null
+++ b/conf_mia.py
@@ -0,0 +1,45 @@
+import math
+from datetime import UTC, datetime, timedelta
+
+from com import FilterableNote, Visibility
+from sec import connect, tokens
+
+user_id = "9gf2ev4ex5dflllo"
+token = tokens["mia"]
+api = "https://void.rehab/api/"
+early_exit = 0xFFF
+
+now = datetime.now(UTC)
+threshold = 0.1
+
+def criteria(root: FilterableNote) -> bool:
+    thread = root.thread()
+    thread_self = root.thread_self()
+
+    # if there are dms involved...
+    is_direct = lambda note: note.visibility == Visibility.direct
+    if any(is_direct(note) for note in thread):
+        most_recent_direct = max(filter(is_direct, thread), key=lambda note: note.when)
+        # ...and the dms are younger than two months...
+        if now - most_recent_direct.when < timedelta(days=30 * 2):
+            # ...do not delete the thread
+            return False
+
+    # get the most recent post...
+    others_recency = max(thread, key=lambda note: note.when)
+    # ...and bail if it's too new
+    if now - others_recency.when < timedelta(days=14):
+        return False
+
+    # get my...
+    most_recent_post = max(thread_self, key=lambda note: note.when) # ...most recent post...
+    score = lambda note: note.reactions + note.renotes*5
+    high_score_post = max(thread_self, key=score) # ...highest scoring post...
+    # ...and their values...
+    most_recent = most_recent_post.when
+    most_recent_age = now - most_recent
+    high_score = score(high_score_post)
+    # ...weigh it...
+    weighted_score = high_score / math.sqrt(most_recent_age.days)
+    # ...and check it against a threshold
+    return weighted_score < threshold
diff --git a/conf_pain.py b/conf_pain.py
new file mode 100644
index 0000000..85e7095
--- /dev/null
+++ b/conf_pain.py
@@ -0,0 +1,15 @@
+import math
+from datetime import UTC, datetime, timedelta
+
+from com import FilterableNote
+from sec import connect, tokens
+
+user_id = "9gszslkcdfnomssj"
+token = tokens["pain"]
+api = "https://void.rehab/api/"
+
+def criteria(root: FilterableNote) -> bool:
+    # two-month variant, currently disabled:
+    # return (datetime.now(UTC) - root.when).days > 60
+    # delete anything older than 12 * 30 days
+    return (datetime.now(UTC) - root.when).days > (12 * 30)
diff --git a/go.sh b/go.sh
new file mode 100755
index 0000000..39f3779
--- /dev/null
+++ b/go.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -ex
+
+test -f graph.db && rm graph.db
+test -f filtered.list && rm filtered.list
+test -d out && rm -r out
+python3 1_graph.py conf_$1.py
+python3 2_filter.py conf_$1.py
+# python3 3_archive.py conf_$1.py
+# echo uploading to memorial
+# rsync -r -e 'ssh -p23' --progress out/ memorial:fediverse/$1/
+# python3 4_delete.py conf_$1.py
diff --git a/proxy.sh b/proxy.sh
new file mode 100755
index 0000000..9628fab
--- /dev/null
+++ b/proxy.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec ssh -NL 5432:localhost:5432 vr
diff --git a/requirements.txt b/requirements.txt
index e69de29..094393e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+httpx
+progressbar2
+psycopg
+brotli
+msgpack