scrubber/2_filter.py

85 lines
2.1 KiB
Python
Raw Normal View History

2024-07-26 10:36:56 -06:00
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List
import psycopg
from ty import FilterableNote, Visibility
try:
import progressbar2 as progressbar
except ImportError:
import progressbar
print("configuring")
# config.py is executed as Python code, so it must be a trusted local file.
# It has to define two names: "connect" (called with no arguments to obtain
# the database connection) and "criteria" (a predicate over FilterableNote).
config = {}
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
criteria: Callable[[FilterableNote], bool] = config["criteria"]

# graph.db holds one note per line as four tab-separated columns:
# id, reply ids, quote ids, flags. The last three are comma-separated lists
# and may be empty. Records are keyed by note id.
intermediate = {}
print("parsing")
for row in Path("graph.db").read_text().splitlines():
    note_id, reply_ids, quote_ids, flag_names = row.split("\t")
    record = {"id": note_id}
    # "".split(",") would yield [""], so empty columns map to [] explicitly.
    for key, column in (
        ("replies", reply_ids),
        ("quotes", quote_ids),
        ("flags", flag_names),
    ):
        record[key] = column.split(",") if column else []
    intermediate[note_id] = record
def _transform_children(ids: list) -> "list | None":
    """Transform each referenced note, or return None at the first missing one."""
    children = []
    for child_id in ids:
        child = transform(intermediate[child_id])
        if child is None:
            # Stop immediately: the whole thread is going to be discarded, so
            # querying the remaining descendants would be wasted work.
            return None
        children.append(child)
    return children


def transform(entry: dict) -> "FilterableNote | None":
    """Build the FilterableNote tree rooted at a parsed graph entry.

    The note's row is fetched from the ``note`` table, and every reply and
    quote listed for it is transformed recursively through ``intermediate``.

    Args:
        entry: One record of ``intermediate``, with the keys ``id``,
            ``replies``, ``quotes`` and ``flags``.

    Returns:
        The populated FilterableNote, or None when this note or any note
        below it has no row in the ``note`` table any more.

    Raises:
        KeyError: If a reply or quote id has no record in ``intermediate``.
    """
    note = conn.execute(
        'select "createdAt", reactions, "renoteCount", visibility from note where id = %s',
        [entry["id"]],
    ).fetchone()
    if note is None:
        return None  # part of thread disappeared during processing
    when, reactions, renotes, visibility = note

    replies = _transform_children(entry["replies"])
    if replies is None:
        return None  # bubble up, buttercup
    quotes = _transform_children(entry["quotes"])
    if quotes is None:
        return None  # bubble up, buttercup

    return FilterableNote(
        entry["id"],
        "self" in entry["flags"],
        replies,
        quotes,
        # NOTE(review): presumably a datetime; astimezone() with no argument
        # converts it to the system local timezone.
        when.astimezone(),
        # NOTE(review): assumes reactions is a mapping of reaction -> count
        # and is never NULL in the database; confirm against the schema.
        sum(reactions.values()),
        renotes,
        Visibility.from_db(visibility),
    )
# Only entries flagged "root" are evaluated directly. Collect them once so
# the progress bar total and the processing loop share the same selection.
roots = [entry for entry in intermediate.values() if "root" in entry["flags"]]
root_count = len(roots)
pb = progressbar.ProgressBar(
    0,
    root_count,
    prefix="processing ",
)
targets = []
for entry in roots:
    transformed = transform(entry)
    # A None result means part of the thread vanished from the database
    # while we were working; skip it, we'll get to it next cycle.
    if transformed is not None and criteria(transformed):
        targets.append(entry["id"])
    # Skipped roots count as processed too, so the bar tracks real progress
    # instead of stalling and jumping at finish().
    pb.increment()
pb.finish()
# One selected root id per line, with no trailing newline.
Path("filtered.list").write_text("\n".join(targets))