mirror of
https://git.mia.jetzt/scrubber
synced 2024-11-23 13:57:24 -07:00
desktop changes
This commit is contained in:
parent
81071e8fee
commit
bb8a48fd4d
11 changed files with 306 additions and 30 deletions
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
/__pycache__/
|
||||
/graph.db
|
||||
/filtered.list
|
||||
/out/
|
||||
/sec.py
|
12
1_graph.py
12
1_graph.py
|
@ -3,24 +3,20 @@ import sys
|
|||
from collections import namedtuple
|
||||
from functools import cache
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import psycopg
|
||||
|
||||
try:
|
||||
import progressbar2 as progressbar
|
||||
except ImportError:
|
||||
import progressbar
|
||||
from com import eval_config, progressbar
|
||||
|
||||
|
||||
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
|
||||
Tree = namedtuple("Tree", ["id", "replies", "renotes"])
|
||||
|
||||
print("configuring")
|
||||
config = {}
|
||||
exec(Path("config.py").read_text(), config)
|
||||
config = eval_config()
|
||||
conn: psycopg.Connection = config["connect"]()
|
||||
user_id: str = config["user_id"]
|
||||
early_exit = config.get("early_exit")
|
||||
early_exit: Optional[int] = config.get("early_exit")
|
||||
|
||||
|
||||
print("fetching note ids", file=sys.stderr)
|
||||
|
|
24
2_filter.py
24
2_filter.py
|
@ -4,32 +4,14 @@ from typing import Callable, List
|
|||
|
||||
import psycopg
|
||||
|
||||
from ty import FilterableNote, Visibility
|
||||
|
||||
try:
|
||||
import progressbar2 as progressbar
|
||||
except ImportError:
|
||||
import progressbar
|
||||
from com import FilterableNote, Visibility, eval_config, parse_graph, progressbar
|
||||
|
||||
|
||||
print("configuring")
|
||||
config = {}
|
||||
exec(Path("config.py").read_text(), config)
|
||||
config = eval_config()
|
||||
conn: psycopg.Connection = config["connect"]()
|
||||
criteria: Callable[[FilterableNote], bool] = config["criteria"]
|
||||
|
||||
intermediate = {}
|
||||
|
||||
print("parsing")
|
||||
for line in Path("graph.db").read_text().splitlines():
|
||||
id, replies, quotes, flags = line.split("\t")
|
||||
intermediate[id] = {
|
||||
"id": id,
|
||||
"replies": replies.split(",") if len(replies) > 0 else [],
|
||||
"quotes": quotes.split(",") if len(quotes) > 0 else [],
|
||||
"flags": flags.split(",") if len(flags) > 0 else [],
|
||||
}
|
||||
|
||||
intermediate = parse_graph()
|
||||
|
||||
def transform(entry: dict) -> FilterableNote:
|
||||
note = conn.execute(
|
||||
|
|
144
3_archive.py
Normal file
144
3_archive.py
Normal file
|
@ -0,0 +1,144 @@
|
|||
import json
|
||||
from http.client import HTTPResponse
|
||||
from pathlib import Path
|
||||
from shutil import copyfileobj
|
||||
from urllib.request import urlopen
|
||||
|
||||
import brotli
|
||||
import msgpack
|
||||
import psycopg
|
||||
|
||||
from com import Visibility, eval_config, parse_graph, progressbar
|
||||
|
||||
# Load the run configuration (config file path comes from argv[1]).
config = eval_config()
conn: psycopg.Connection = config["connect"]()  # live Postgres connection to the instance DB

# graph: {note_id: {"id", "replies", "quotes", "flags"}} built by 1_graph.py.
graph = parse_graph()
print("reading filterlist")
# filtered.list: one root note id per line, produced by 2_filter.py.
filtered = Path("filtered.list").read_text().strip().splitlines()

# Cache of archived user records keyed by user id (filled by collect_user).
collected_users = {}
|
||||
def collect_user(id: str):
    """Fetch user `id` from the database and cache an archive record for it
    in `collected_users`; a second call for the same id is a no-op.

    Returns None (implicitly or explicitly) — the result lands in the cache.
    """
    if id in collected_users:
        return
    row = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
    if row is None:
        return None
    username, host, avatar_url = row
    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
    # Users without a profile row get an empty description and no fields.
    description, fields = profile if profile is not None else ("", [])

    collected_users[id] = {
        "id": id,
        "username": username,
        "host": host,
        "description": description,
        "fields": fields,
        "avatar_url": avatar_url,
    }
|
||||
|
||||
# (root_id, serialized tree) pairs, in filterlist order.
collected_notes = []
# (file_id, url) pairs of our own attachments, downloaded at the end.
files_to_collect = []
|
||||
def collect_note(id: str):
    """Recursively serialize note `id` — plus its replies and quotes from the
    pre-built `graph` — into a plain dict for archiving.

    Returns None when the note no longer exists in the database.
    Side effects: caches the author via collect_user(), and schedules our own
    attachments for download in `files_to_collect`.
    """
    output = {}
    output["id"] = id

    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
    if note is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
    collect_user(user_id)

    output["text"] = text
    output["user_id"] = user_id
    # Timestamps are rendered in the local timezone as ISO 8601 strings.
    output["created_at"] = created_at.astimezone(tz=None).isoformat()
    output["updated_at"] = None
    if updated_at is not None:
        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
    output["reactions"] = reactions
    output["renotes"] = renotes
    output["visibility"] = Visibility.from_db(visibility).code()

    # Recurse through the thread graph; children deleted from the DB
    # (collect_note -> None) are dropped.
    node = graph[id]
    replies = [collect_note(reply) for reply in node["replies"]]
    replies = filter(lambda reply: reply is not None, replies)
    quotes = [collect_note(quote) for quote in node["quotes"]]
    quotes = filter(lambda quote: quote is not None, quotes)

    output["attachments"] = []
    for file_id in file_ids:
        name, type_, comment, url = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
        attachment = {
            "id": file_id,
            "type": type_,
            "comment": comment,
        }
        if "self" in node["flags"]:  # archive own attachments
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        # BUG FIX: the attachment record was built but never stored, so
        # output["attachments"] stayed empty and was always deleted below.
        output["attachments"].append(attachment)
        # NOTE(review): `name` is fetched but not archived — confirm whether
        # the filename should be part of the attachment record.

    output["replies"] = list(replies)
    output["quotes"] = list(quotes)

    # Drop empty collections to keep the msgpack output compact.
    if len(output["attachments"]) == 0: del output["attachments"]
    if len(output["replies"]) == 0: del output["replies"]
    if len(output["quotes"]) == 0: del output["quotes"]

    return output
|
||||
|
||||
# Serialize every filtered thread root (and, recursively, its tree).
pb = progressbar.ProgressBar(
    0,
    len(filtered),
    prefix="collecting data ",
)
for id in filtered:
    note = collect_note(id)
    # NOTE(review): collect_note returns None for notes missing from the DB;
    # the (id, None) pair is still kept and written later — confirm intended.
    collected_notes.append((id, note))
    pb.increment()
pb.finish()
|
||||
|
||||
# Create the output directory tree (idempotent): out/{note,user,file}.
outdir = Path("out")
for subdir in ("note", "user", "file"):
    # parents=True also creates `out` itself; exist_ok makes reruns safe.
    (outdir / subdir).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Write every collected note and user as a brotli-compressed msgpack blob.
pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)

for id, note in collected_notes:
    outfile = outdir / "note" / f"{id}.mpk.br"
    with outfile.open("wb") as f:
        f.write(brotli.compress(msgpack.dumps(note)))
    pb.increment()

for id, user in collected_users.items():
    outfile = outdir / "user" / f"{id}.mpk.br"
    with outfile.open("wb") as f:
        # BUG FIX: previously serialized `note` (the stale loop variable from
        # the notes loop above), so every user file contained the last note
        # instead of the user record.
        f.write(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()
|
||||
|
||||
# Download our own attachments scheduled by collect_note.
pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for (id, url) in files_to_collect:
    outfile = outdir / "file" / id
    # Context managers close the HTTP response and the file even when
    # copyfileobj raises (the old explicit close() leaked on failure).
    with urlopen(url) as response, outfile.open("wb") as f:
        copyfileobj(response, f)
    pb.increment()
pb.finish()
|
33
4_delete.py
Normal file
33
4_delete.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import psycopg
|
||||
|
||||
from com import eval_config, parse_graph, progressbar
|
||||
|
||||
config = eval_config()
|
||||
conn: psycopg.Connection = config["connect"]()
|
||||
token: str = config["token"]
|
||||
api: str = config["api"]
|
||||
|
||||
graph = parse_graph()
|
||||
print("reading filterlist")
|
||||
filtered = Path("filtered.list").read_text().strip().splitlines()
|
||||
|
||||
queue = []
|
||||
|
||||
def enqueue(note):
    # Depth-first walk of the thread graph: descendants are queued before
    # this note so deletion can proceed leaves-first.
    # NOTE(review): recursion assumes the graph is acyclic and shallow enough
    # for Python's recursion limit — confirm for large threads.
    for reply in note["replies"]:
        enqueue(graph[reply])
    for quote in note["quotes"]:
        enqueue(graph[quote])
    # Only our own notes (flagged "self" by 1_graph.py) are deleted.
    if "self" in note["flags"]:
        # Attachment file ids are needed to delete the drive files too.
        # NOTE(review): fetchone()[0] raises if the note row vanished between
        # graph build and deletion — confirm acceptable.
        files = conn.execute('select "fileIds" from note where id = %s', [note["id"]]).fetchone()[0]
        queue.append((note["id"], files))
|
||||
|
||||
# Queue every filtered thread root (and its tree) for deletion.
for id in filtered:
    enqueue(graph[id])

# Dry run for now: print the queue instead of deleting.
print(queue)

# client = httpx.Client()
|
|
@ -1,7 +1,15 @@
|
|||
import sys
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Callable
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List
|
||||
|
||||
try:
|
||||
import progressbar2 as progressbar
|
||||
except ImportError:
|
||||
import progressbar
|
||||
|
||||
|
||||
class Visibility(Enum):
|
||||
public = 1
|
||||
|
@ -18,6 +26,13 @@ class Visibility(Enum):
|
|||
case "specified": return cls.direct
|
||||
case _: raise ValueError(f"unknown visibility `{raw}`")
|
||||
|
||||
def code(self) -> str:
    """Return the single-letter code stored in archive output for this
    visibility level (p/u/f/d)."""
    match self:
        # `self.public` etc. are dotted value patterns, so each case compares
        # against the enum member rather than binding a new name.
        case self.public: return "p"
        case self.unlisted: return "u"
        case self.followers: return "f"
        case self.direct: return "d"
|
||||
|
||||
|
||||
@dataclass
|
||||
class FilterableNote:
|
||||
|
@ -59,3 +74,24 @@ class FilterableNote:
|
|||
"reactions": self.reactions,
|
||||
"renotes": self.renotes,
|
||||
}
|
||||
|
||||
|
||||
def eval_config() -> dict:
    """Load configuration by executing the Python file named in argv[1].

    The config file (e.g. conf_mia.py) is run in a fresh namespace; that
    namespace dict is returned as the configuration.

    NOTE(review): exec() runs arbitrary code — config files must be trusted.
    """
    print("configuring")
    namespace: dict = {}
    exec(Path(sys.argv[1]).read_text(), namespace)
    return namespace
|
||||
|
||||
|
||||
def parse_graph() -> Dict[str, dict]:
    """Read graph.db (tab-separated: id, replies, quotes, flags) into a dict
    of node records keyed by note id; list fields are comma-separated."""
    print("parsing graph")
    nodes: Dict[str, dict] = {}
    for row in Path("graph.db").read_text().splitlines():
        note_id, replies, quotes, flags = row.split("\t")
        nodes[note_id] = {
            "id": note_id,
            "replies": replies.split(",") if replies else [],
            "quotes": quotes.split(",") if quotes else [],
            "flags": flags.split(",") if flags else [],
        }
    return nodes
|
46
conf_mia.py
Normal file
46
conf_mia.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
import math
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
from com import FilterableNote, Visibility
|
||||
from sec import connect, tokens
|
||||
|
||||
user_id = "9gf2ev4ex5dflllo"
|
||||
token = tokens["mia"]
|
||||
api = "https://void.rehab/api/"
|
||||
early_exit = 0xFFF
|
||||
|
||||
now = datetime.now(UTC)
|
||||
threshold = 0.1
|
||||
|
||||
def criteria(root: FilterableNote) -> bool:
    """Decide whether root's thread should be scrubbed (True = delete)."""
    thread = root.thread()
    thread_self = root.thread_self()

    # if there are dms involved...
    low_vis = min(thread, key=lambda note: note.visibility.value)
    if low_vis.visibility == Visibility.direct:
        is_direct = lambda note: note.visibility == Visibility.direct
        most_recent_direct = max(filter(is_direct, thread), key=lambda note: note.when)
        # ...and the dms are younger than two months...
        if now - most_recent_direct.when < timedelta(days=30 * 2):
            # ...do not delete the thread
            return False

    # get the most recent post...
    others_recency = max(thread, key=lambda note: note.when)
    # ...and bail if it's too new
    if now - others_recency.when < timedelta(days=14):
        return False

    # get my...
    # NOTE(review): max() raises ValueError if thread_self is empty (a thread
    # with none of my posts) — confirm the filter pipeline precludes that.
    most_recent_post = max(thread_self, key=lambda note: note.when) # ...most recent post...
    score = lambda note: note.reactions + note.renotes*5
    high_score_post = max(thread_self, key=score) # ...highest scoring post...
    # ...and their values...
    most_recent = most_recent_post.when
    most_recent_age = now - most_recent
    high_score = score(high_score_post)
    # ...weigh it...
    # NOTE(review): raises ZeroDivisionError when most_recent_age.days == 0;
    # the 14-day bail-out above only covers `thread`, which may not include my
    # own newest post if thread/thread_self differ — confirm.
    weighted_score = high_score / math.sqrt(most_recent_age.days)
    # ...and check it against a threshold
    return weighted_score < threshold
|
14
conf_pain.py
Normal file
14
conf_pain.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
import math
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
from com import FilterableNote
|
||||
from sec import connect, tokens
|
||||
|
||||
user_id = "9gszslkcdfnomssj"
|
||||
token = tokens["pain"]
|
||||
api = "https://void.rehab/api/"
|
||||
|
||||
def criteria(root: "FilterableNote") -> bool:
    """Delete policy for this account: True once the root note is older than
    twelve 30-day months (~360 days)."""
    # earlier policy, kept for reference:
    # return (datetime.now(UTC) - root.when).days > 60
    age = datetime.now(UTC) - root.when
    return age.days > 12 * 30
|
13
go.sh
Executable file
13
go.sh
Executable file
|
@ -0,0 +1,13 @@
|
|||
#!/bin/sh
# Run the scrub pipeline for one account: ./go.sh <name> uses conf_<name>.py.

set -ex

# Remove artifacts of a previous run (the && lists do not trip `set -e`).
test -f graph.db && rm graph.db
test -f filtered.list && rm filtered.list
test -d out && rm -r out
# Quote "$1" so a missing or space-containing argument fails loudly instead
# of silently expanding to a different filename.
python3 1_graph.py "conf_$1.py"
python3 2_filter.py "conf_$1.py"
# python3 3_archive.py conf_$1.py
# echo uploading to memorial
# rsync -r -e 'ssh -p23' --progress out/ memorial:fediverse/$1/
# python3 4_delete.py conf_$1.py
|
2
proxy.sh
Executable file
2
proxy.sh
Executable file
|
@ -0,0 +1,2 @@
|
|||
#!/bin/sh
# Forward local port 5432 to Postgres on the remote host (ssh alias `vr`,
# presumably from ~/.ssh/config — verify); -N runs no remote command, so this
# blocks as a pure tunnel for the conf_*.py `connect` callables.
exec ssh -NL 5432:localhost:5432 vr
|
|
@ -0,0 +1,5 @@
|
|||
httpx
|
||||
progressbar2
|
||||
psycopg
|
||||
brotli
|
||||
msgpack
|
Loading…
Reference in a new issue