scrubber/1_graph.py

160 lines
3.9 KiB
Python
Raw Permalink Normal View History

2024-07-26 10:36:56 -06:00
import json
import sys
2024-10-04 16:43:40 -06:00
import time
2024-07-26 10:36:56 -06:00
from collections import namedtuple
from functools import cache
from pathlib import Path
2024-09-04 05:47:13 -06:00
from typing import Optional
2024-07-26 10:36:56 -06:00
import psycopg
2024-09-04 05:47:13 -06:00
from com import eval_config, progressbar
2024-07-26 10:36:56 -06:00
# Columns read from a `note` row: the renote target, the reply target and the author.
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
# A graph node: the note id plus the child nodes that reply to / renote it.
Tree = namedtuple("Tree", ["id", "replies", "renotes"])

config = eval_config()
conn: psycopg.Connection = config["connect"]()
user_id: str = config["user_id"]
# Optional cap on how many of the user's note ids are collected.
early_exit: Optional[int] = config.get("early_exit")

print("fetching note ids", file=sys.stderr)
note_ids = set()
# Ids of the configured user's notes, skipping rows that have a renoteId but no text.
cur = conn.execute(
    'select id from note where "userId" = %s and not ("renoteId" is not null and text is null)',
    [user_id],
)
while True:
    batch = cur.fetchmany(0xFF)
    if not batch:
        break
    # Short pause between batches (presumably to go easy on the database -- confirm).
    time.sleep(0.0001)
    note_ids.update(record[0] for record in batch)
    # The cap is checked per batch, so the final set may overshoot it.
    if early_exit and len(note_ids) > early_exit:
        break
@cache
def get_note(id: str) -> Note:
    """Load the renote/reply/author columns of one note, memoised per id.

    Args:
        id: Primary key of the row in the `note` table.

    Returns:
        A `Note` built from the row's "renoteId", "replyId" and "userId".

    Raises:
        LookupError: If no row with that id exists.
    """
    # Short pause before every uncached lookup (presumably to throttle
    # database load -- confirm).
    time.sleep(0.0001)
    row = conn.execute(
        'select "renoteId", "replyId", "userId" from note where id = %s', [id]
    ).fetchone()
    if row is None:
        # fetchone() gives None for an empty result; unpacking that would
        # surface as an opaque TypeError, so fail with a clear message.
        raise LookupError(f"note {id!r} not found")
    return Note(*row)
# id -> Tree for notes that have neither a reply target nor a renote target.
roots = {}
# id -> Tree for every note placed in the graph so far.
trees = {}


def tree_init(id: str, seek: bool = True) -> Tree:
    """Return the graph node for note *id*, creating it and its ancestors.

    A node that already exists in `trees` is returned as-is. Otherwise the
    note is loaded, the nodes of its reply target and renote target are
    created recursively, and the new node is appended to their `replies` /
    `renotes` lists. A note with neither target is recorded in `roots`.

    Args:
        id: Id of the note to place in the graph.
        seek: Accepted for callers, but not read by this function.

    Returns:
        The (possibly pre-existing) `Tree` for *id*.
    """
    known = trees.get(id)
    if known:
        return known

    node = Tree(id, [], [])
    note = get_note(id)
    has_parent = note.reply_id or note.renote_id
    if not has_parent:
        roots[id] = node
    else:
        if note.reply_id:
            tree_init(note.reply_id).replies.append(node)
        if note.renote_id:
            tree_init(note.renote_id, False).renotes.append(node)
    # Registered only after the ancestors have been resolved.
    trees[id] = node
    return node
def make_widgets(msg, trees, roots):
    """Build the widget list for a progress bar labelled *msg*.

    Args:
        msg: Text shown at the start of the bar.
        trees: When truthy, include a "trees" variable widget.
        roots: When truthy, include a "roots" variable widget.

    Returns:
        A list of widgets: label, percentage, bar, value/max counter, the
        requested variable widgets, and finally an ETA.
    """
    layout = [
        f"{msg} ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.SimpleProgress("%(value_s)s/%(max_value_s)s"),
        " ",
    ]
    # Optional counters, in fixed order, each followed by a spacer.
    for enabled, variable in ((trees, "trees"), (roots, "roots")):
        if enabled:
            layout.extend([progressbar.Variable(variable), " "])
    layout.append(progressbar.ETA())
    return layout
# Place every fetched note (and, through tree_init, its ancestors) in the graph.
build_widgets = make_widgets("building trees", True, True)
pb = progressbar.ProgressBar(0, len(note_ids), widgets=build_widgets)
for note_id in note_ids:
    tree_init(note_id)
    # Report the running totals alongside the bar.
    pb.increment(trees=len(trees), roots=len(roots))
pb.finish()
def traverse(tree: Tree):
    """Walk down the reply tree until a note by the configured user is found.

    When the note behind *tree* was written by `user_id`, the subtree is
    handed to `expand`; otherwise each reply child is traversed in turn.

    Args:
        tree: Node to start from.
    """
    if get_note(tree.id).user_id == user_id:
        expand(tree)
        return
    for reply in tree.replies:
        traverse(reply)
def expand(tree: Tree):
    """Attach notes returned by `note_replies` for *tree*, then recurse.

    Every id returned by the `note_replies(id, 1, 1000)` query that is not yet
    in `trees` is loaded; it is appended to `tree.replies` when its reply
    target is this note and to `tree.renotes` when its renote target is this
    note, and registered in `trees` in either case. Afterwards every reply
    child of *tree* is expanded the same way.

    Args:
        tree: Node whose replies and renotes should be filled in.
    """
    # Short pause before each query (presumably to throttle database load -- confirm).
    time.sleep(0.0001)
    found = conn.execute(
        "select id from note_replies(%s, 1, 1000)", [tree.id]
    ).fetchall()
    for record in found:
        child_id = record[0]
        if child_id in trees:
            continue
        note = get_note(child_id)
        node = Tree(child_id, [], [])
        is_reply = note.reply_id == tree.id
        is_renote = note.renote_id == tree.id
        if is_reply:
            tree.replies.append(node)
        if is_renote:
            tree.renotes.append(node)
        # Only notes actually linked to this one are registered.
        if is_reply or is_renote:
            trees[child_id] = node
    for reply in tree.replies:
        expand(reply)
# Walk every root and pull in the replies/renotes below the user's own notes.
roots_len = len(roots)
expand_widgets = make_widgets("expanding roots", True, False)
pb = progressbar.ProgressBar(0, roots_len, widgets=expand_widgets)
for root in roots.values():
    traverse(root)
    # Report how many nodes the graph holds so far.
    pb.increment(trees=len(trees))
pb.finish()
# Write the graph to graph.db, one tab-separated line per node:
#   <id> \t <comma-joined reply ids> \t <comma-joined renote ids> \t <flags>
# where flags is a comma-joined subset of "root" (node is in `roots`) and
# "self" (note was written by the configured user).
with Path("graph.db").open("w") as f:
    pb = progressbar.ProgressBar(
        0, len(trees), widgets=make_widgets("saving graph", False, False)
    )
    for tree in trees.values():
        flags = []
        if tree.id in roots:
            flags.append("root")
        if get_note(tree.id).user_id == user_id:
            flags.append("self")
        reply_ids = ",".join(reply.id for reply in tree.replies)
        renote_ids = ",".join(renote.id for renote in tree.renotes)
        flag_list = ",".join(flags)
        f.write(f"{tree.id}\t{reply_ids}\t{renote_ids}\t{flag_list}\n")
        pb.increment()
    pb.finish()