initial commit

This commit is contained in:
mia 2024-07-26 09:36:56 -07:00
commit 81071e8fee
4 changed files with 304 additions and 0 deletions

159
1_graph.py Normal file
View file

@ -0,0 +1,159 @@
import json
import sys
from collections import namedtuple
from functools import cache
from pathlib import Path
import psycopg
try:
import progressbar2 as progressbar
except ImportError:
import progressbar
# Linkage columns of a single note row: what it renotes, what it replies to,
# and who wrote it.
Note = namedtuple("Note", "renote_id reply_id user_id")
# Graph node: a note id plus mutable lists of child nodes (replies, renotes).
Tree = namedtuple("Tree", "id replies renotes")
# Settings come from executing config.py as Python; every name it defines
# ends up as a key of `config`.
print("configuring")
config = {}
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
user_id: str = config["user_id"]
# Optional: stop collecting once more than this many ids have been gathered.
early_exit = config.get("early_exit")

print("fetching note ids", file=sys.stderr)
note_ids = set()
# Ids of the user's notes, leaving out rows that have a renoteId but no text.
cur = conn.execute(
    'select id from note where "userId" = %s and not ("renoteId" is not null and text is null)',
    [user_id],
)
# Drain the cursor in batches of 0xFF rows; the early-exit cap is checked
# once per batch, so the final count may overshoot it.
while True:
    rows = cur.fetchmany(0xFF)
    if not rows:
        break
    note_ids.update(row[0] for row in rows)
    if early_exit and len(note_ids) > early_exit:
        break
@cache
def get_note(id: str) -> Note:
    """Fetch the linkage columns of one note, memoised per id.

    Args:
        id: Value matched against the ``id`` column of the ``note`` table.

    Returns:
        A ``Note`` built from the row's "renoteId", "replyId" and "userId".

    Raises:
        LookupError: If the query returns no row for ``id``.
    """
    row = conn.execute(
        'select "renoteId", "replyId", "userId" from note where id = %s', [id]
    ).fetchone()
    if row is None:
        # fetchone() yields None on an empty result; unpacking that would
        # surface as an opaque TypeError, so name the missing note instead.
        raise LookupError(f"note {id!r} not found")
    return Note(*row)
# id -> Tree for notes that have neither a reply parent nor a renote target.
roots = {}
# id -> Tree for every node created so far.
trees = {}


def tree_init(id: str, seek: bool = True) -> Tree:
    """Return the Tree node for ``id``, creating it and its ancestors on demand.

    A new node is attached to the ``replies`` list of the note it replies to
    and/or the ``renotes`` list of the note it renotes; those parents are
    created recursively first. A note with neither link is recorded in
    ``roots``. Every new node is recorded in ``trees``.

    Args:
        id: Note id to look up or create a node for.
        seek: Accepted for callers; the body does not read it.

    Returns:
        The existing or newly created ``Tree`` for ``id``.
    """
    known = trees.get(id)
    if known:
        return known

    node = Tree(id, [], [])
    note = get_note(id)
    attached = False
    if note.reply_id:
        tree_init(note.reply_id).replies.append(node)
        attached = True
    if note.renote_id:
        tree_init(note.renote_id, False).renotes.append(node)
        attached = True
    if not attached:
        roots[id] = node

    # Registered only after the parents, so ancestors precede descendants
    # in the insertion order of `trees`.
    trees[id] = node
    return node
def make_widgets(msg, trees, roots):
    """Assemble the widget list for one of the script's progress bars.

    Args:
        msg: Label placed at the start of the bar.
        trees: When truthy, include a "trees" variable widget.
        roots: When truthy, include a "roots" variable widget.

    Returns:
        A list of strings and progressbar widgets, ending with an ETA widget.
    """
    layout = [
        f"{msg} ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.SimpleProgress("%(value_s)s/%(max_value_s)s"),
        " ",
    ]
    # Optional counters, each followed by a spacer, in a fixed order.
    for enabled, variable_name in ((trees, "trees"), (roots, "roots")):
        if enabled:
            layout.extend((progressbar.Variable(variable_name), " "))
    layout.append(progressbar.ETA())
    return layout
# Create a node (plus any missing ancestors) for every fetched note id,
# reporting the running tree and root counts on the progress bar.
pb = progressbar.ProgressBar(
    min_value=0,
    max_value=len(note_ids),
    widgets=make_widgets("building trees", True, True),
)
for note_id in note_ids:
    tree_init(note_id)
    pb.increment(trees=len(trees), roots=len(roots))
pb.finish()
def traverse(tree: Tree):
    """Walk down the reply chain until a note by the configured user is found.

    The first node on each branch whose author equals ``user_id`` is handed
    to ``expand``; nodes by other authors are only passed through to their
    replies.

    Args:
        tree: Node to start from.
    """
    if get_note(tree.id).user_id != user_id:
        for reply in tree.replies:
            traverse(reply)
        return
    expand(tree)
def expand(tree: Tree):
    """Attach not-yet-known replies and renotes of ``tree``, then recurse.

    Ids returned by the ``note_replies`` database function that are missing
    from ``trees`` get a fresh node, which is appended to ``tree.replies``
    and/or ``tree.renotes`` depending on which link points back at ``tree``.
    Afterwards every reply of ``tree`` is expanded the same way.

    Args:
        tree: Node whose children should be completed.
    """
    # NOTE(review): the literal arguments 1 and 1000 are presumably a depth
    # and a row limit for note_replies — confirm against the database schema.
    found = conn.execute(
        "select id from note_replies(%s, 1, 1000)", [tree.id]
    ).fetchall()
    for row in found:
        child_id = row[0]
        if child_id in trees:
            continue
        note = get_note(child_id)
        node = Tree(child_id, [], [])
        is_reply = note.reply_id == tree.id
        is_renote = note.renote_id == tree.id
        if is_reply:
            tree.replies.append(node)
        if is_renote:
            tree.renotes.append(node)
        # Only nodes that were actually linked are registered.
        if is_reply or is_renote:
            trees[child_id] = node

    for reply in tree.replies:
        expand(reply)
# Walk every root once, pulling in replies and renotes that were not reached
# while building the trees; the bar shows the growing node count.
roots_len = len(roots)
pb = progressbar.ProgressBar(
    min_value=0,
    max_value=roots_len,
    widgets=make_widgets("expanding roots", True, False),
)
for root in roots.values():
    traverse(root)
    pb.increment(trees=len(trees))
pb.finish()
# Write the graph as one tab-separated line per node:
#   <id> TAB <reply ids, comma-joined> TAB <renote ids, comma-joined> TAB <flags>
# where flags is a comma-joined subset of "root" (node is in `roots`) and
# "self" (the note's author equals `user_id`).
with Path("graph.db").open("w") as f:
    pb = progressbar.ProgressBar(
        0, len(trees), widgets=make_widgets("saving graph", False, False)
    )
    # Only the nodes are needed here, so iterate the values, not the items.
    for tree in trees.values():
        flags = []
        if tree.id in roots:
            flags.append("root")
        if get_note(tree.id).user_id == user_id:
            flags.append("self")
        replies = ",".join(reply.id for reply in tree.replies)
        renotes = ",".join(renote.id for renote in tree.renotes)
        # One write per row instead of seven small ones; same bytes on disk.
        f.write(f"{tree.id}\t{replies}\t{renotes}\t{','.join(flags)}\n")
        pb.increment()
    pb.finish()

84
2_filter.py Normal file
View file

@ -0,0 +1,84 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List
import psycopg
from ty import FilterableNote, Visibility
try:
import progressbar2 as progressbar
except ImportError:
import progressbar
# Settings come from executing config.py as Python; every name it defines
# ends up as a key of `config`.
print("configuring")
config = {}
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
# Predicate applied to each transformed root note.
criteria: Callable[[FilterableNote], bool] = config["criteria"]

# id -> {"id", "replies", "quotes", "flags"} parsed from graph.db, where each
# line holds four tab-separated columns and the last three are comma-joined
# lists (an empty column means an empty list).
intermediate = {}
print("parsing")
for line in Path("graph.db").read_text().splitlines():
    id, replies, quotes, flags = line.split("\t")
    intermediate[id] = dict(
        id=id,
        replies=replies.split(",") if replies else [],
        quotes=quotes.split(",") if quotes else [],
        flags=flags.split(",") if flags else [],
    )
def transform(entry: dict) -> FilterableNote | None:
    """Turn a parsed graph entry into a FilterableNote, recursing into children.

    Args:
        entry: One value of ``intermediate`` with the keys "id", "replies",
            "quotes" and "flags".

    Returns:
        The populated ``FilterableNote``, or ``None`` when this note or any
        note below it no longer has a row in the ``note`` table.
    """
    row = conn.execute(
        'select "createdAt", reactions, "renoteCount", visibility from note where id = %s',
        [entry["id"]],
    ).fetchone()
    if row is None:
        return None  # part of thread disappeared during processing
    when, reactions, renotes, visibility = row

    # Replies first, then quotes. Stop at the first missing descendant: the
    # whole thread is discarded anyway, so querying the rest is wasted work.
    children = {"replies": [], "quotes": []}
    for kind, converted in children.items():
        for child_id in entry[kind]:
            child = transform(intermediate[child_id])
            if child is None:
                return None  # bubble the gap up to the root
            converted.append(child)

    return FilterableNote(
        entry["id"],
        "self" in entry["flags"],
        children["replies"],
        children["quotes"],
        when.astimezone(),
        # `reactions` is consumed as a mapping whose values are counts.
        sum(reactions.values()),
        renotes,
        Visibility.from_db(visibility),
    )
# Collect the root entries once; the same list sizes the progress bar and
# drives the loop.
root_entries = [
    entry for entry in intermediate.values() if "root" in entry["flags"]
]
root_count = len(root_entries)
pb = progressbar.ProgressBar(
    0,
    root_count,
    prefix="processing ",
)
targets = []
for entry in root_entries:
    transformed = transform(entry)
    # Advance for every root, including skipped ones, so the bar can reach
    # its maximum.
    pb.increment()
    if transformed is None:
        continue  # thread is incomplete right now; left for a later run
    if criteria(transformed):
        targets.append(entry["id"])
pb.finish()
# One matching root id per line.
Path("filtered.list").write_text("\n".join(targets))

0
requirements.txt Normal file
View file

61
ty.py Normal file
View file

@ -0,0 +1,61 @@
from dataclasses import dataclass
from typing import List, Callable
from datetime import datetime
from enum import Enum
class Visibility(Enum):
    """Audience of a note, decoupled from the strings stored in the database."""

    public = 1
    unlisted = 2
    followers = 3
    direct = 4

    @classmethod
    def from_db(cls, raw: str) -> "Visibility":
        """Map a database visibility string onto its enum member.

        Args:
            raw: One of "public", "home", "followers" or "specified".

        Returns:
            The matching ``Visibility`` member.

        Raises:
            ValueError: If ``raw`` is none of the known strings.
        """
        known = (
            ("public", cls.public),
            ("home", cls.unlisted),
            ("followers", cls.followers),
            ("specified", cls.direct),
        )
        for db_value, member in known:
            if raw == db_value:
                return member
        raise ValueError(f"unknown visibility `{raw}`")
@dataclass
class FilterableNote:
    """A note together with its nested replies and quotes."""

    id: str
    # Whether the note is flagged as the configured user's own.
    mine: bool
    replies: List["FilterableNote"]
    quotes: List["FilterableNote"]
    when: datetime
    reactions: int
    renotes: int
    visibility: Visibility

    def thread(self) -> List["FilterableNote"]:
        """Return every note in this subtree, descendants first and self last.

        Reply subtrees come before quote subtrees.
        """
        flattened = [
            note
            for child in (*self.replies, *self.quotes)
            for note in child.thread()
        ]
        flattened.append(self)
        return flattened

    def thread_self(self) -> List["FilterableNote"]:
        """Return the notes of this subtree whose ``mine`` is set.

        Same ordering as ``thread``: descendants first, self last.
        """
        own = [
            note
            for child in (*self.replies, *self.quotes)
            for note in child.thread_self()
        ]
        if self.mine:
            own.append(self)
        return own

    def to_dict(self):
        """Return a nested plain-dict form of this note and its children.

        ``when`` is rendered with ``isoformat()``; ``visibility`` is not part
        of the result.
        """
        return dict(
            id=self.id,
            mine=self.mine,
            replies=[reply.to_dict() for reply in self.replies],
            quotes=[quote.to_dict() for quote in self.quotes],
            when=self.when.isoformat(),
            reactions=self.reactions,
            renotes=self.renotes,
        )