prune_objects can prune orphaned activities who reference an array of objects

E.g. Flag activities have an array of objects

We prune the activity when NONE of the objects can be found

Note that the cost of finding and deleting these is ~4x higher than finding and deleting the non-array ones

Only string:
Delete on activities  (cost=506573.48..506580.38 rows=0 width=0)

Only Array:
Delete on activities  (cost=3570359.68..4276365.34 rows=0 width=0)

(They are still executed separately, so the total cost is the sum of the two)
This commit is contained in:
ilja 2023-01-08 18:22:53 +01:00
parent a7ec6e039c
commit 57eef6d764
2 changed files with 94 additions and 18 deletions

View file

@ -172,35 +172,48 @@ defmodule Mix.Tasks.Pleroma.Database do
|> Repo.delete_all(timeout: :infinity) |> Repo.delete_all(timeout: :infinity)
if Keyword.get(options, :prune_orphaned_activities) do if Keyword.get(options, :prune_orphaned_activities) do
# Prune activities who link to a single object
""" """
delete from public.activities delete from public.activities
where id in ( where id in (
select a.id from public.activities a select a.id from public.activities a
left join public.objects o on a.data ->> 'object' = o.data ->> 'id' left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
left join public.users u on a.data ->> 'object' = u.ap_id left join public.users u on a.data ->> 'object' = u.ap_id
-- Only clean up remote activities where not a.local
where not a.local and jsonb_typeof(a."data" -> 'object') = 'string'
-- For now we only focus on activities with direct links to objects and o.id is null
-- e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects) and a2.id is null
and jsonb_typeof(a."data" -> 'object') = 'string' and u.id is null
-- Find Activities that don't have existing objects
and o.id is null
and a2.id is null
and u.id is null
) )
""" """
|> Repo.query() |> Repo.query([], timeout: :infinity)
# Prune activities who link to an array of objects
"""
delete from public.activities
where id in (
select a.id from public.activities a
join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array'
left join public.objects o on j.value = o.data ->> 'id'
left join public.activities a2 on j.value = a2.data ->> 'id'
left join public.users u on j.value = u.ap_id
group by a.id
having max(o.data ->> 'id') is null
and max(a2.data ->> 'id') is null
and max(u.ap_id) is null
)
"""
|> Repo.query([], timeout: :infinity)
end end
prune_hashtags_query = """ """
DELETE FROM hashtags AS ht DELETE FROM hashtags AS ht
WHERE NOT EXISTS ( WHERE NOT EXISTS (
SELECT 1 FROM hashtags_objects hto SELECT 1 FROM hashtags_objects hto
WHERE ht.id = hto.hashtag_id) WHERE ht.id = hto.hashtag_id)
""" """
|> Repo.query()
Repo.query(prune_hashtags_query)
if Keyword.get(options, :vacuum) do if Keyword.get(options, :vacuum) do
Maintenance.vacuum("full") Maintenance.vacuum("full")

View file

@ -354,7 +354,7 @@ defmodule Mix.Tasks.Pleroma.DatabaseTest do
assert length(Repo.all(Object)) == 1 assert length(Repo.all(Object)) == 1
end end
test "We don't have unexpected tables which can contain objects that are referenced by activities" do test "We don't have unexpected tables which may contain objects that are referenced by activities" do
# We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
# If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we
# add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities. # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities.
@ -481,6 +481,69 @@ defmodule Mix.Tasks.Pleroma.DatabaseTest do
assert length(activities) == 4 assert length(activities) == 4
end end
test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do
%Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert()
%User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert()
# Multiple objects, one object exists (keep)
%Activity{}
|> Map.merge(%{
local: false,
data: %{
"id" => "remote_activity_existing_object",
"object" => ["non_ existing_object", "existing_object"]
}
})
|> Repo.insert()
# Multiple objects, one actor exists (keep)
%Activity{}
|> Map.merge(%{
local: false,
data: %{
"id" => "remote_activity_existing_actor",
"object" => ["non_ existing_object", "existing_actor"]
}
})
|> Repo.insert()
# Multiple objects, one activity exists (keep)
%Activity{}
|> Map.merge(%{
local: false,
data: %{
"id" => "remote_activity_existing_activity",
"object" => ["non_ existing_object", "remote_activity_existing_actor"]
}
})
|> Repo.insert()
# Multiple objects none exist (prune)
%Activity{}
|> Map.merge(%{
local: false,
data: %{
"id" => "remote_activity_without_existing_referenced_object",
"object" => ["owo", "whats_this"]
}
})
|> Repo.insert()
assert length(Repo.all(Activity)) == 4
Mix.Tasks.Pleroma.Database.run(["prune_objects"])
assert length(Repo.all(Activity)) == 4
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
activities = Repo.all(Activity)
assert length(activities) == 3
assert "remote_activity_without_existing_referenced_object" not in Enum.map(
activities,
fn a -> a.data["id"] end
)
assert length(activities) == 3
end
end end
describe "running update_users_following_followers_counts" do describe "running update_users_following_followers_counts" do