mirror of
https://git.mia.jetzt/scrubber
synced 2024-11-23 13:57:24 -07:00
desktop changes
This commit is contained in:
parent
81071e8fee
commit
bb8a48fd4d
11 changed files with 306 additions and 30 deletions
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
/__pycache__/
|
||||||
|
/graph.db
|
||||||
|
/filtered.list
|
||||||
|
/out/
|
||||||
|
/sec.py
|
12
1_graph.py
12
1_graph.py
|
@ -3,24 +3,20 @@ import sys
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import psycopg
|
import psycopg
|
||||||
|
|
||||||
try:
|
from com import eval_config, progressbar
|
||||||
import progressbar2 as progressbar
|
|
||||||
except ImportError:
|
|
||||||
import progressbar
|
|
||||||
|
|
||||||
|
|
||||||
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
|
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
|
||||||
Tree = namedtuple("Tree", ["id", "replies", "renotes"])
|
Tree = namedtuple("Tree", ["id", "replies", "renotes"])
|
||||||
|
|
||||||
print("configuring")
|
config = eval_config()
|
||||||
config = {}
|
|
||||||
exec(Path("config.py").read_text(), config)
|
|
||||||
conn: psycopg.Connection = config["connect"]()
|
conn: psycopg.Connection = config["connect"]()
|
||||||
user_id: str = config["user_id"]
|
user_id: str = config["user_id"]
|
||||||
early_exit = config.get("early_exit")
|
early_exit: Optional[int] = config.get("early_exit")
|
||||||
|
|
||||||
|
|
||||||
print("fetching note ids", file=sys.stderr)
|
print("fetching note ids", file=sys.stderr)
|
||||||
|
|
24
2_filter.py
24
2_filter.py
|
@ -4,32 +4,14 @@ from typing import Callable, List
|
||||||
|
|
||||||
import psycopg
|
import psycopg
|
||||||
|
|
||||||
from ty import FilterableNote, Visibility
|
from com import FilterableNote, Visibility, eval_config, parse_graph, progressbar
|
||||||
|
|
||||||
try:
|
|
||||||
import progressbar2 as progressbar
|
|
||||||
except ImportError:
|
|
||||||
import progressbar
|
|
||||||
|
|
||||||
|
|
||||||
print("configuring")
|
config = eval_config()
|
||||||
config = {}
|
|
||||||
exec(Path("config.py").read_text(), config)
|
|
||||||
conn: psycopg.Connection = config["connect"]()
|
conn: psycopg.Connection = config["connect"]()
|
||||||
criteria: Callable[[FilterableNote], bool] = config["criteria"]
|
criteria: Callable[[FilterableNote], bool] = config["criteria"]
|
||||||
|
|
||||||
intermediate = {}
|
intermediate = parse_graph()
|
||||||
|
|
||||||
print("parsing")
|
|
||||||
for line in Path("graph.db").read_text().splitlines():
|
|
||||||
id, replies, quotes, flags = line.split("\t")
|
|
||||||
intermediate[id] = {
|
|
||||||
"id": id,
|
|
||||||
"replies": replies.split(",") if len(replies) > 0 else [],
|
|
||||||
"quotes": quotes.split(",") if len(quotes) > 0 else [],
|
|
||||||
"flags": flags.split(",") if len(flags) > 0 else [],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def transform(entry: dict) -> FilterableNote:
|
def transform(entry: dict) -> FilterableNote:
|
||||||
note = conn.execute(
|
note = conn.execute(
|
||||||
|
|
144
3_archive.py
Normal file
144
3_archive.py
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
import json
|
||||||
|
from http.client import HTTPResponse
|
||||||
|
from pathlib import Path
|
||||||
|
from shutil import copyfileobj
|
||||||
|
from urllib.request import urlopen
|
||||||
|
|
||||||
|
import brotli
|
||||||
|
import msgpack
|
||||||
|
import psycopg
|
||||||
|
|
||||||
|
from com import Visibility, eval_config, parse_graph, progressbar
|
||||||
|
|
||||||
|
# Load the run configuration and open the database connection it supplies.
config = eval_config()
conn: psycopg.Connection = config["connect"]()

# Reply/quote graph, keyed by note id.
graph = parse_graph()

# Every line of filtered.list is later handed to collect_note() as a note id.
print("reading filterlist")
filterlist_text = Path("filtered.list").read_text()
filtered = filterlist_text.strip().splitlines()

# Users referenced by collected notes, keyed by user id (filled by collect_user).
collected_users = dict()
|
||||||
|
def collect_user(id: str):
    """Look up one user and cache their details in ``collected_users``.

    Reads username, host and avatar URL from the ``user`` table and the
    description and fields from ``user_profile``. A missing profile row falls
    back to an empty description and an empty field list.

    Does nothing when the id has already been collected or when no ``user``
    row exists for it. Always returns ``None``; the result is stored in
    ``collected_users[id]``.
    """
    if id in collected_users:
        return

    user_row = conn.execute(
        'select username, host, "avatarUrl" from "user" where id = %s', [id]
    ).fetchone()
    if user_row is None:
        return None
    username, host, avatar_url = user_row

    profile_row = conn.execute(
        'select description, fields from user_profile where "userId" = %s', [id]
    ).fetchone()
    if profile_row:
        description, fields = profile_row
    else:
        description, fields = "", []

    collected_users[id] = {
        "id": id,
        "username": username,
        "host": host,
        "description": description,
        "fields": fields,
        "avatar_url": avatar_url,
    }
|
||||||
|
|
||||||
|
# (note id, collected note dict or None) pairs, one per filterlist entry.
collected_notes = []
# (file id, url) pairs for attachments that must be downloaded into the archive.
files_to_collect = []


def collect_note(id: str):
    """Collect one note and, recursively, its replies and quotes.

    Reads the note row from the database, registers its author through
    ``collect_user`` and walks ``graph[id]`` to collect child notes.

    Returns a dict with the keys ``id``, ``text``, ``user_id``,
    ``created_at``, ``updated_at``, ``reactions``, ``renotes`` and
    ``visibility``. The keys ``attachments``, ``replies`` and ``quotes`` are
    present only when non-empty. Returns ``None`` when the note row does not
    exist.

    Raises ``KeyError`` when the note exists in the database but not in
    ``graph``.
    """
    row = conn.execute(
        'select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s',
        [id],
    ).fetchone()
    if row is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = row
    collect_user(user_id)

    output = {
        "id": id,
        "text": text,
        "user_id": user_id,
        "created_at": created_at.astimezone(tz=None).isoformat(),
        "updated_at": None,
        "reactions": reactions,
        "renotes": renotes,
        "visibility": Visibility.from_db(visibility).code(),
    }
    if updated_at is not None:
        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()

    # Children first (replies, then quotes); notes that no longer exist are dropped.
    node = graph[id]
    replies = [
        reply
        for reply in (collect_note(reply_id) for reply_id in node["replies"])
        if reply is not None
    ]
    quotes = [
        quote
        for quote in (collect_note(quote_id) for quote_id in node["quotes"])
        if quote is not None
    ]

    own_note = "self" in node["flags"]
    attachments = []
    for file_id in file_ids:
        file_row = conn.execute(
            'select name, type, comment, url from drive_file where id = %s', [file_id]
        ).fetchone()
        if file_row is None:
            # The note references a file with no drive_file row; nothing to record.
            continue
        _name, type_, comment, url = file_row
        attachment = {
            "id": file_id,
            "type": type_,
            "comment": comment,
        }
        if own_note:  # archive own attachments
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        # The original built this dict and then discarded it, so archived
        # notes never carried any attachments.
        attachments.append(attachment)

    # Optional keys are added only when non-empty, in this fixed order.
    if attachments:
        output["attachments"] = attachments
    if replies:
        output["replies"] = replies
    if quotes:
        output["quotes"] = quotes

    return output
|
||||||
|
|
||||||
|
# Collect every filterlist entry. The id is recorded together with whatever
# collect_note() returned, including None for a note that no longer exists.
pb = progressbar.ProgressBar(0, len(filtered), prefix="collecting data ")
for note_id in filtered:
    collected_notes.append((note_id, collect_note(note_id)))
    pb.increment()
pb.finish()
|
||||||
|
|
||||||
|
# Output layout: out/note, out/user and out/file.
# exist_ok makes re-runs safe without a separate exists() check, and
# parents=True creates `out` itself along with the first subdirectory.
outdir = Path("out")
for subdir in ("note", "user", "file"):
    (outdir / subdir).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Write every collected note and user as brotli-compressed msgpack.
pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)

for id, note in collected_notes:
    outfile = outdir / "note" / f"{id}.mpk.br"
    outfile.write_bytes(brotli.compress(msgpack.dumps(note)))
    pb.increment()

for id, user in collected_users.items():
    outfile = outdir / "user" / f"{id}.mpk.br"
    # Serialise the user record itself. The original dumped `note` here, so
    # every user file contained a copy of the last note instead.
    outfile.write_bytes(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()
|
||||||
|
|
||||||
|
# Download each queued attachment to out/file/<file id>.
pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for (id, url) in files_to_collect:
    outfile = outdir / "file" / id
    response: HTTPResponse
    # Both handles are context-managed so the HTTP response is closed even
    # when opening or writing the output file fails.
    with urlopen(url) as response, outfile.open("wb") as f:
        copyfileobj(response, f)
    pb.increment()
pb.finish()
|
33
4_delete.py
Normal file
33
4_delete.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import psycopg
|
||||||
|
|
||||||
|
from com import eval_config, parse_graph, progressbar
|
||||||
|
|
||||||
|
# Load the run configuration, open the database connection, and read the
# API settings from the same config.
config = eval_config()
conn: psycopg.Connection = config["connect"]()
token: str = config["token"]
api: str = config["api"]

# Reply/quote graph, keyed by note id.
graph = parse_graph()

# Every line of filtered.list is later looked up in the graph and enqueued.
print("reading filterlist")
filterlist_text = Path("filtered.list").read_text()
filtered = filterlist_text.strip().splitlines()

# (note id, file ids) pairs, filled by enqueue().
queue = list()
|
||||||
|
|
||||||
|
# Ids already visited by enqueue(), so each note is queued at most once.
_enqueued = set()


def enqueue(note):
    """Queue *note* and everything below it in the graph.

    *note* is a graph entry (a dict with ``id``, ``replies``, ``quotes`` and
    ``flags``). Replies and quotes are visited before the note itself, so
    children are queued ahead of their parents. Only notes flagged ``self``
    are appended to ``queue``, as ``(note id, file ids)`` pairs.

    A note reachable by more than one path (for example both a reply and a
    quote) is visited once. A ``self`` note whose row is gone from the
    database is skipped instead of raising ``TypeError``.

    Raises ``KeyError`` when a reply or quote id is missing from ``graph``.
    """
    note_id = note["id"]
    if note_id in _enqueued:
        return
    _enqueued.add(note_id)

    for reply in note["replies"]:
        enqueue(graph[reply])
    for quote in note["quotes"]:
        enqueue(graph[quote])

    if "self" in note["flags"]:
        row = conn.execute('select "fileIds" from note where id = %s', [note_id]).fetchone()
        if row is None:
            # The note no longer exists in the database.
            return
        queue.append((note_id, row[0]))
|
||||||
|
|
||||||
|
# Walk the graph from every filterlist entry, then show the resulting queue.
for root_id in filtered:
    root = graph[root_id]
    enqueue(root)

print(queue)
|
||||||
|
|
||||||
|
# client = httpx.Client()
|
|
@ -1,7 +1,15 @@
|
||||||
|
import sys
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Callable
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, Dict, List
|
||||||
|
|
||||||
|
try:
|
||||||
|
import progressbar2 as progressbar
|
||||||
|
except ImportError:
|
||||||
|
import progressbar
|
||||||
|
|
||||||
|
|
||||||
class Visibility(Enum):
|
class Visibility(Enum):
|
||||||
public = 1
|
public = 1
|
||||||
|
@ -17,6 +25,13 @@ class Visibility(Enum):
|
||||||
case "followers": return cls.followers
|
case "followers": return cls.followers
|
||||||
case "specified": return cls.direct
|
case "specified": return cls.direct
|
||||||
case _: raise ValueError(f"unknown visibility `{raw}`")
|
case _: raise ValueError(f"unknown visibility `{raw}`")
|
||||||
|
|
||||||
|
def code(self) -> str:
|
||||||
|
match self:
|
||||||
|
case self.public: return "p"
|
||||||
|
case self.unlisted: return "u"
|
||||||
|
case self.followers: return "f"
|
||||||
|
case self.direct: return "d"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -59,3 +74,24 @@ class FilterableNote:
|
||||||
"reactions": self.reactions,
|
"reactions": self.reactions,
|
||||||
"renotes": self.renotes,
|
"renotes": self.renotes,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def eval_config(path=None) -> dict:
    """Execute a Python config file and return its resulting namespace.

    Args:
        path: Config file to run. Defaults to the first command-line
            argument (``sys.argv[1]``).

    Returns:
        The globals dict produced by executing the file, so every top-level
        name the config defines is available as a key.

    Raises:
        SystemExit: No path was given and no command-line argument exists.
        OSError: The file cannot be read.

    NOTE: the file is run with ``exec`` and has full interpreter access, so
    only point this at config files you trust.
    """
    print("configuring")
    if path is None:
        if len(sys.argv) < 2:
            raise SystemExit(f"usage: {sys.argv[0]} <config.py>")
        path = sys.argv[1]
    config = {}
    source = Path(path).read_text(encoding="utf-8")
    # Compiling with the real filename makes tracebacks from the config
    # point at the config file rather than "<string>".
    exec(compile(source, str(path), "exec"), config)
    return config
|
||||||
|
|
||||||
|
|
||||||
|
def _split_ids(field: str) -> list:
    """Split a comma-separated field; an empty field yields an empty list."""
    return field.split(",") if field else []


def parse_graph(path="graph.db") -> Dict[str, dict]:
    """Parse the tab-separated graph file into a dict keyed by note id.

    Each line holds four tab-separated fields: the note id, then
    comma-separated reply ids, quote ids and flags. Blank lines are skipped.

    Args:
        path: Graph file to read. Defaults to ``graph.db`` in the current
            directory.

    Returns:
        A mapping from note id to a dict with the keys ``id``, ``replies``,
        ``quotes`` and ``flags``; the last three are lists of strings.

    Raises:
        ValueError: A non-blank line does not have exactly four fields.
        OSError: The file cannot be read.
    """
    print("parsing graph")
    graph = {}
    lines = Path(path).read_text(encoding="utf-8").splitlines()
    for lineno, line in enumerate(lines, start=1):
        if not line:
            continue
        fields = line.split("\t")
        if len(fields) != 4:
            raise ValueError(
                f"{path}:{lineno}: expected 4 tab-separated fields, got {len(fields)}"
            )
        note_id, replies, quotes, flags = fields
        graph[note_id] = {
            "id": note_id,
            "replies": _split_ids(replies),
            "quotes": _split_ids(quotes),
            "flags": _split_ids(flags),
        }
    return graph
|
46
conf_mia.py
Normal file
46
conf_mia.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
import math
|
||||||
|
from datetime import UTC, datetime, timedelta
|
||||||
|
|
||||||
|
from com import FilterableNote, Visibility
|
||||||
|
from sec import connect, tokens
|
||||||
|
|
||||||
|
user_id = "9gf2ev4ex5dflllo"
|
||||||
|
token = tokens["mia"]
|
||||||
|
api = "https://void.rehab/api/"
|
||||||
|
early_exit = 0xFFF
|
||||||
|
|
||||||
|
# Reference time for all age calculations, fixed once when the config is evaluated.
now = datetime.now(UTC)
# Threads whose weighted score falls below this value are selected.
threshold = 0.1


def criteria(root: FilterableNote) -> bool:
    """Decide whether the thread rooted at *root* is selected by the filter.

    Returns ``False`` (keep) when the thread contains a direct-visibility
    note younger than 60 days, or when any note in the thread is younger
    than 14 days. Otherwise the best score among the thread's own posts
    (reactions + 5 * renotes) is divided by the square root of the age in
    days of the most recent own post, and the thread is selected when that
    weighted score is below ``threshold``.
    """
    thread = root.thread()
    thread_self = root.thread_self()

    # if there are dms involved...
    # NOTE(review): the original picked min(thread) by visibility.value and
    # tested that single note. That only detects a DM when `direct` has the
    # smallest enum value, which cannot be confirmed from this file. Testing
    # each note directly does not depend on the enum's numbering.
    direct_notes = [note for note in thread if note.visibility == Visibility.direct]
    if direct_notes:
        most_recent_direct = max(direct_notes, key=lambda note: note.when)
        # ...and the dms are younger than two months...
        if now - most_recent_direct.when < timedelta(days=30 * 2):
            # ...do not delete the thread
            return False

    # get the most recent post...
    others_recency = max(thread, key=lambda note: note.when)
    # ...and bail if it's too new
    if now - others_recency.when < timedelta(days=14):
        return False

    def score(note):
        return note.reactions + note.renotes * 5

    # get my most recent post and my highest scoring post...
    most_recent_post = max(thread_self, key=lambda note: note.when)
    high_score_post = max(thread_self, key=score)
    # ...and their values...
    most_recent_age = now - most_recent_post.when
    high_score = score(high_score_post)
    # ...weigh it (ages under one day count as one day, avoiding a division by zero)...
    weighted_score = high_score / math.sqrt(max(most_recent_age.days, 1))
    # ...and check it against a threshold
    return weighted_score < threshold
|
14
conf_pain.py
Normal file
14
conf_pain.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
import math
|
||||||
|
from datetime import UTC, datetime, timedelta
|
||||||
|
|
||||||
|
from com import FilterableNote
|
||||||
|
from sec import connect, tokens
|
||||||
|
|
||||||
|
user_id = "9gszslkcdfnomssj"
|
||||||
|
token = tokens["pain"]
|
||||||
|
api = "https://void.rehab/api/"
|
||||||
|
|
||||||
|
def criteria(root: FilterableNote) -> bool:
|
||||||
|
# if it's more than two months old, delete
|
||||||
|
# return (datetime.now(UTC) - root.when).days > 60
|
||||||
|
return (datetime.now(UTC) - root.when).days > (12 * 30)
|
13
go.sh
Executable file
13
go.sh
Executable file
|
@ -0,0 +1,13 @@
|
||||||
|
#!/bin/sh
# Run the pipeline steps for one account.
# Usage: go.sh <name>   (uses conf_<name>.py as the config file)

set -ex

if [ -z "$1" ]; then
    echo "usage: $0 <name>   (uses conf_<name>.py)" >&2
    exit 1
fi
conf="conf_$1.py"

# Start clean: -f makes removal a no-op when the target does not exist.
rm -f graph.db filtered.list
rm -rf out

python3 1_graph.py "$conf"
python3 2_filter.py "$conf"
# python3 3_archive.py "$conf"
# echo uploading to memorial
# rsync -r -e 'ssh -p23' --progress out/ "memorial:fediverse/$1/"
# python3 4_delete.py "$conf"
|
2
proxy.sh
Executable file
2
proxy.sh
Executable file
|
@ -0,0 +1,2 @@
|
||||||
|
#!/bin/sh
# Forward local port 5432 to localhost:5432 as seen from the ssh host `vr`,
# without running a remote command (-N).
exec ssh -N -L 5432:localhost:5432 vr
|
|
@ -0,0 +1,5 @@
|
||||||
|
httpx
|
||||||
|
progressbar2
|
||||||
|
psycopg
|
||||||
|
brotli
|
||||||
|
msgpack
|
Loading…
Reference in a new issue