RichMedia refactor

Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break.

Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data.

Implementation notes:

- The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL
- The Task will attempt to fetch the data 3 times with increasing sleep time between attempts
- The HTTP request obeys the default HTTP request timeout value instead of 2 seconds
- URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes
- URLs that fail with an expected error will receive a negative cache with no TTL
- Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered
- Expiring image URLs are handled with an Oban job
- There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time
- The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering

Overall performance of timelines and creating new posts which contain URLs is greatly improved.
This commit is contained in:
Mark Felder 2024-02-11 16:11:52 -05:00 committed by Floatingghost
parent a924e117fd
commit 5da9cbd8a5
16 changed files with 431 additions and 240 deletions

View file

@ -0,0 +1 @@
Refactored Rich Media to cache the content in the database. Fetching operations that could block status rendering have been eliminated.

View file

@ -579,7 +579,8 @@ config :pleroma, Oban,
mute_expire: 5, mute_expire: 5,
search_indexing: 10, search_indexing: 10,
nodeinfo_fetcher: 1, nodeinfo_fetcher: 1,
database_prune: 1 database_prune: 1,
rich_media_expiration: 2
], ],
plugins: [ plugins: [
Oban.Plugins.Pruner, Oban.Plugins.Pruner,

View file

@ -141,6 +141,8 @@ config :phoenix, :plug_init_mode, :runtime
config :pleroma, :instances_favicons, enabled: false config :pleroma, :instances_favicons, enabled: false
config :pleroma, :instances_nodeinfo, enabled: false config :pleroma, :instances_nodeinfo, enabled: false
config :pleroma, Pleroma.Web.RichMedia.Backfill, provider: Pleroma.Web.RichMedia.Backfill
if File.exists?("./config/test.secret.exs") do if File.exists?("./config/test.secret.exs") do
import_config "test.secret.exs" import_config "test.secret.exs"
else else

View file

@ -67,22 +67,9 @@ defmodule Pleroma.HTML do
end end
end end
def extract_first_external_url_from_object(%{data: %{"content" => content}} = object) @spec extract_first_external_url_from_object(Pleroma.Object.t()) :: String.t() | nil
def extract_first_external_url_from_object(%{data: %{"content" => content}})
when is_binary(content) do when is_binary(content) do
unless object.data["fake"] do
key = "URL|#{object.id}"
@cachex.fetch!(:scrubber_cache, key, fn _key ->
{:commit, {:ok, extract_first_external_url(content)}}
end)
else
{:ok, extract_first_external_url(content)}
end
end
def extract_first_external_url_from_object(_), do: {:error, :no_content}
def extract_first_external_url(content) do
content content
|> Floki.parse_fragment!() |> Floki.parse_fragment!()
|> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])") |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
@ -90,4 +77,6 @@ defmodule Pleroma.HTML do
|> Floki.attribute("href") |> Floki.attribute("href")
|> Enum.at(0) |> Enum.at(0)
end end
def extract_first_external_url_from_object(_), do: nil
end end

View file

@ -155,9 +155,7 @@ defmodule Pleroma.Web.ActivityPub.ActivityPub do
# Splice in the child object if we have one. # Splice in the child object if we have one.
activity = Maps.put_if_present(activity, :object, object) activity = Maps.put_if_present(activity, :object, object)
ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn -> Pleroma.Web.RichMedia.Card.get_by_activity(activity)
Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end)
end)
# Add local posts to search index # Add local posts to search index
if local, do: Pleroma.Search.add_to_index(activity) if local, do: Pleroma.Search.add_to_index(activity)
@ -185,7 +183,7 @@ defmodule Pleroma.Web.ActivityPub.ActivityPub do
id: "pleroma:fakeid" id: "pleroma:fakeid"
} }
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) Pleroma.Web.RichMedia.Card.get_by_activity(activity)
{:ok, activity} {:ok, activity}
{:remote_limit_pass, _} -> {:remote_limit_pass, _} ->

View file

@ -225,9 +225,7 @@ defmodule Pleroma.Web.ActivityPub.SideEffects do
end end
end end
ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn -> Pleroma.Web.RichMedia.Card.get_by_activity(activity)
Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end)
end)
Pleroma.Search.add_to_index(Map.put(activity, :object, object)) Pleroma.Search.add_to_index(Map.put(activity, :object, object))

View file

@ -25,6 +25,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do
alias Pleroma.Web.OAuth.Token alias Pleroma.Web.OAuth.Token
alias Pleroma.Web.Plugs.OAuthScopesPlug alias Pleroma.Web.Plugs.OAuthScopesPlug
alias Pleroma.Web.Plugs.RateLimiter alias Pleroma.Web.Plugs.RateLimiter
alias Pleroma.Web.RichMedia.Card
plug(Pleroma.Web.ApiSpec.CastAndValidate) plug(Pleroma.Web.ApiSpec.CastAndValidate)
@ -383,6 +384,21 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do
end end
end end
@doc "GET /api/v1/statuses/:id/card"
@deprecated "https://github.com/tootsuite/mastodon/pull/11213"
def card(
%{assigns: %{user: user}, private: %{open_api_spex: %{params: %{id: status_id}}}} = conn,
_
) do
with %Activity{} = activity <- Activity.get_by_id(status_id),
true <- Visibility.visible_for_user?(activity, user),
%Card{} = card_data <- Card.get_by_activity(activity) do
render(conn, "card.json", card_data)
else
_ -> render_error(conn, :not_found, "Record not found")
end
end
@doc "GET /api/v1/statuses/:id/favourited_by" @doc "GET /api/v1/statuses/:id/favourited_by"
def favourited_by(%{assigns: %{user: user}} = conn, %{id: id}) do def favourited_by(%{assigns: %{user: user}} = conn, %{id: id}) do
with true <- Pleroma.Config.get([:instance, :show_reactions]), with true <- Pleroma.Config.get([:instance, :show_reactions]),

View file

@ -22,6 +22,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
alias Pleroma.Web.MediaProxy alias Pleroma.Web.MediaProxy
alias Pleroma.Web.PleromaAPI.EmojiReactionController alias Pleroma.Web.PleromaAPI.EmojiReactionController
require Logger require Logger
alias Pleroma.Web.RichMedia.Card
import Pleroma.Web.ActivityPub.Visibility, only: [get_visibility: 1, visible_for_user?: 2] import Pleroma.Web.ActivityPub.Visibility, only: [get_visibility: 1, visible_for_user?: 2]
@ -30,9 +31,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
# pagination is restricted to 40 activities at a time # pagination is restricted to 40 activities at a time
defp fetch_rich_media_for_activities(activities) do defp fetch_rich_media_for_activities(activities) do
Enum.each(activities, fn activity -> Enum.each(activities, fn activity ->
spawn(fn -> spawn(fn -> Card.get_by_activity(activity) end)
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
end)
end) end)
end end
@ -93,9 +92,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
# To do: check AdminAPIControllerTest on the reasons behind nil activities in the list # To do: check AdminAPIControllerTest on the reasons behind nil activities in the list
activities = Enum.filter(opts.activities, & &1) activities = Enum.filter(opts.activities, & &1)
# Start fetching rich media before doing anything else, so that later calls to get the cards # Start prefetching rich media before doing anything else
# only block for timeout in the worst case, as opposed to
# length(activities_with_links) * timeout
fetch_rich_media_for_activities(activities) fetch_rich_media_for_activities(activities)
replied_to_activities = get_replied_to_activities(activities) replied_to_activities = get_replied_to_activities(activities)
@ -301,6 +298,18 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
object object
|> render_content() |> render_content()
quote_post =
if visible_for_user?(quote_activity, opts[:for]) and opts[:show_quote] != false do
quote_rendering_opts = Map.merge(opts, %{activity: quote_activity, show_quote: false})
render("show.json", quote_rendering_opts)
else
nil
end
content =
object
|> render_content()
content_html = content_html =
content content
|> Activity.HTML.get_cached_scrubbed_html_for_activity( |> Activity.HTML.get_cached_scrubbed_html_for_activity(
@ -318,6 +327,64 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
summary = object.data["summary"] || "" summary = object.data["summary"] || ""
card =
case Card.get_by_activity(activity) do
%Card{} = result -> render("card.json", result)
_ -> nil
end
url =
if user.local do
Pleroma.Web.Router.Helpers.o_status_url(Pleroma.Web.Endpoint, :notice, activity)
else
object.data["url"] || object.data["external_url"] || object.data["id"]
end
direct_conversation_id =
with {_, nil} <- {:direct_conversation_id, opts[:direct_conversation_id]},
{_, true} <- {:include_id, opts[:with_direct_conversation_id]},
{_, %User{} = for_user} <- {:for_user, opts[:for]} do
Activity.direct_conversation_id(activity, for_user)
else
{:direct_conversation_id, participation_id} when is_integer(participation_id) ->
participation_id
_e ->
nil
end
emoji_reactions =
object
|> Object.get_emoji_reactions()
|> EmojiReactionController.filter_allowed_users(
opts[:for],
Map.get(opts, :with_muted, false)
)
|> Stream.map(fn {emoji, users, url} ->
build_emoji_map(emoji, users, url, opts[:for])
end)
|> Enum.to_list()
# Status muted state (would do 1 request per status unless user mutes are preloaded)
muted =
thread_muted? ||
UserRelationship.exists?(
get_in(opts, [:relationships, :user_relationships]),
:mute,
opts[:for],
user,
fn for_user, user -> User.mutes?(for_user, user) end
)
content_plaintext =
content
|> Activity.HTML.get_cached_stripped_html_for_activity(
activity,
"mastoapi:content:#{chrono_order}"
)
summary = object.data["summary"] || ""
card = render("card.json", Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)) card = render("card.json", Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity))
url = url =
@ -528,15 +595,8 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
} }
end end
def render("card.json", %{rich_media: rich_media, page_url: page_url}) do def render("card.json", %Card{fields: rich_media}) do
page_url_data = URI.parse(page_url) page_url_data = URI.parse(rich_media["url"])
page_url_data =
if is_binary(rich_media["url"]) do
URI.merge(page_url_data, URI.parse(rich_media["url"]))
else
page_url_data
end
page_url = page_url_data |> to_string page_url = page_url_data |> to_string

View file

@ -0,0 +1,101 @@
# Pleroma: A lightweight social networking server
# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Backfill.Task do
alias Pleroma.Web.RichMedia.Backfill
def run(args) do
Task.Supervisor.start_child(Pleroma.TaskSupervisor, Backfill, :run, [args],
name: {:global, {:rich_media, args.url_hash}}
)
end
end
defmodule Pleroma.Web.RichMedia.Backfill do
alias Pleroma.Web.RichMedia.Card
alias Pleroma.Web.RichMedia.Parser
alias Pleroma.Web.RichMedia.Parser.TTL
alias Pleroma.Workers.RichMediaExpirationWorker
require Logger
@backfiller Pleroma.Config.get([__MODULE__, :provider], Pleroma.Web.RichMedia.Backfill.Task)
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
@max_attempts 3
@retry 5_000
def start(%{url: url} = args) when is_binary(url) do
url_hash = Card.url_to_hash(url)
args =
args
|> Map.put(:attempt, 1)
|> Map.put(:url_hash, url_hash)
@backfiller.run(args)
end
def run(%{url: url, url_hash: url_hash, attempt: attempt} = args)
when attempt <= @max_attempts do
case Parser.parse(url) do
{:ok, fields} ->
{:ok, card} = Card.create(url, fields)
maybe_schedule_expiration(url, fields)
if Map.has_key?(args, :activity_id) do
stream_update(args)
end
warm_cache(url_hash, card)
{:error, {:invalid_metadata, fields}} ->
Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
negative_cache(url_hash)
{:error, :body_too_large} ->
Logger.error("Rich media error for #{url}: :body_too_large")
negative_cache(url_hash)
{:error, {:content_type, type}} ->
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
negative_cache(url_hash)
e ->
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
:timer.sleep(@retry * attempt)
run(%{args | attempt: attempt + 1})
end
end
def run(%{url: url, url_hash: url_hash}) do
Logger.debug("Rich media failure for #{url}")
negative_cache(url_hash, :timer.minutes(15))
end
defp maybe_schedule_expiration(url, fields) do
case TTL.get_from_image(fields, url) do
ttl when is_number(ttl) ->
timestamp = DateTime.from_unix!(ttl)
RichMediaExpirationWorker.new(%{"url" => url}, scheduled_at: timestamp)
|> Oban.insert()
_ ->
:ok
end
end
defp stream_update(%{activity_id: activity_id}) do
Pleroma.Activity.get_by_id(activity_id)
|> Pleroma.Activity.normalize()
|> Pleroma.Web.ActivityPub.ActivityPub.stream_out()
end
defp warm_cache(key, val), do: @cachex.put(:rich_media_cache, key, val)
defp negative_cache(key, ttl \\ nil), do: @cachex.put(:rich_media_cache, key, nil, ttl: ttl)
end

View file

@ -0,0 +1,157 @@
defmodule Pleroma.Web.RichMedia.Card do
use Ecto.Schema
import Ecto.Changeset
import Ecto.Query
alias Pleroma.Activity
alias Pleroma.HTML
alias Pleroma.Object
alias Pleroma.Repo
alias Pleroma.Web.RichMedia.Backfill
alias Pleroma.Web.RichMedia.Parser
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
@type t :: %__MODULE__{}
schema "rich_media_card" do
field(:url_hash, :binary)
field(:fields, :map)
timestamps()
end
@doc false
def changeset(card, attrs) do
card
|> cast(attrs, [:url_hash, :fields])
|> validate_required([:url_hash, :fields])
|> unique_constraint(:url_hash)
end
@spec create(String.t(), map()) :: {:ok, t()}
def create(url, fields) do
url_hash = url_to_hash(url)
fields = Map.put_new(fields, "url", url)
%__MODULE__{}
|> changeset(%{url_hash: url_hash, fields: fields})
|> Repo.insert(on_conflict: {:replace, [:fields]}, conflict_target: :url_hash)
end
@spec delete(String.t()) :: {:ok, Ecto.Schema.t()} | {:error, Ecto.Changeset.t()} | :ok
def delete(url) do
url_hash = url_to_hash(url)
@cachex.del(:rich_media_cache, url_hash)
case get_by_url(url) do
%__MODULE{} = card -> Repo.delete(card)
nil -> :ok
end
end
@spec get_by_url(String.t() | nil) :: t() | nil | :error
def get_by_url(url) when is_binary(url) do
if @config_impl.get([:rich_media, :enabled]) do
url_hash = url_to_hash(url)
@cachex.fetch!(:rich_media_cache, url_hash, fn _ ->
result =
__MODULE__
|> where(url_hash: ^url_hash)
|> Repo.one()
case result do
%__MODULE__{} = card -> {:commit, card}
_ -> {:ignore, nil}
end
end)
else
:error
end
end
def get_by_url(nil), do: nil
@spec get_or_backfill_by_url(String.t(), map()) :: t() | nil
def get_or_backfill_by_url(url, backfill_opts \\ %{}) do
case get_by_url(url) do
%__MODULE__{} = card ->
card
nil ->
backfill_opts = Map.put(backfill_opts, :url, url)
Backfill.start(backfill_opts)
nil
:error ->
nil
end
end
@spec get_by_object(Object.t()) :: t() | nil | :error
def get_by_object(object) do
case HTML.extract_first_external_url_from_object(object) do
nil -> nil
url -> get_or_backfill_by_url(url)
end
end
@spec get_by_activity(Activity.t()) :: t() | nil | :error
# Fake/Draft activity
def get_by_activity(%Activity{id: "pleroma:fakeid"} = activity) do
with %Object{} = object <- Object.normalize(activity, fetch: false),
url when not is_nil(url) <- HTML.extract_first_external_url_from_object(object) do
case get_by_url(url) do
# Cache hit
%__MODULE__{} = card ->
card
# Cache miss, but fetch for rendering the Draft
_ ->
with {:ok, fields} <- Parser.parse(url),
{:ok, card} <- create(url, fields) do
card
else
_ -> nil
end
end
else
_ ->
nil
end
end
def get_by_activity(activity) do
with %Object{} = object <- Object.normalize(activity, fetch: false),
{_, nil} <- {:cached, get_cached_url(object, activity.id)} do
nil
else
{:cached, url} ->
get_or_backfill_by_url(url, %{activity_id: activity.id})
_ ->
:error
end
end
@spec url_to_hash(String.t()) :: String.t()
def url_to_hash(url) do
:crypto.hash(:sha256, url) |> Base.encode16(case: :lower)
end
defp get_cached_url(object, activity_id) do
key = "URL|#{activity_id}"
@cachex.fetch!(:scrubber_cache, key, fn _ ->
url = HTML.extract_first_external_url_from_object(object)
Activity.HTML.add_cache_key_for(activity_id, key)
{:commit, url}
end)
end
end

View file

@ -3,85 +3,13 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Helpers do defmodule Pleroma.Web.RichMedia.Helpers do
alias Pleroma.Activity
alias Pleroma.Config alias Pleroma.Config
alias Pleroma.HTML
alias Pleroma.Object
alias Pleroma.Web.RichMedia.Parser
@options [
max_body: 2_000_000,
receive_timeout: 2_000
]
@spec validate_page_url(URI.t() | binary()) :: :ok | :error
defp validate_page_url(page_url) when is_binary(page_url) do
validate_tld = Config.get([Pleroma.Formatter, :validate_tld])
page_url
|> Linkify.Parser.url?(validate_tld: validate_tld)
|> parse_uri(page_url)
end
defp validate_page_url(%URI{host: host, scheme: "https", authority: authority})
when is_binary(authority) do
cond do
host in Config.get([:rich_media, :ignore_hosts], []) ->
:error
get_tld(host) in Config.get([:rich_media, :ignore_tld], []) ->
:error
true ->
:ok
end
end
defp validate_page_url(_), do: :error
defp parse_uri(true, url) do
url
|> URI.parse()
|> validate_page_url
end
defp parse_uri(_, _), do: :error
defp get_tld(host) do
host
|> String.split(".")
|> Enum.reverse()
|> hd
end
def fetch_data_for_object(object) do
with true <- Config.get([:rich_media, :enabled]),
{:ok, page_url} <-
HTML.extract_first_external_url_from_object(object),
:ok <- validate_page_url(page_url),
{:ok, rich_media} <- Parser.parse(page_url) do
%{page_url: page_url, rich_media: rich_media}
else
_ -> %{}
end
end
def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
with true <- Config.get([:rich_media, :enabled]),
%Object{} = object <- Object.normalize(activity, fetch: false) do
fetch_data_for_object(object)
else
_ -> %{}
end
end
def fetch_data_for_activity(_), do: %{}
def rich_media_get(url) do def rich_media_get(url) do
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}] headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
head_check = head_check =
case Pleroma.HTTP.head(url, headers, @options) do case Pleroma.HTTP.head(url, headers, http_options()) do
# If the HEAD request didn't reach the server for whatever reason, # If the HEAD request didn't reach the server for whatever reason,
# we assume the GET that comes right after won't either # we assume the GET that comes right after won't either
{:error, _} = e -> {:error, _} = e ->
@ -96,7 +24,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
:ok :ok
end end
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, @options) with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
end end
defp check_content_type(headers) do defp check_content_type(headers) do
@ -112,12 +40,13 @@ defmodule Pleroma.Web.RichMedia.Helpers do
end end
end end
@max_body @options[:max_body]
defp check_content_length(headers) do defp check_content_length(headers) do
max_body = Keyword.get(http_options(), :max_body)
case List.keyfind(headers, "content-length", 0) do case List.keyfind(headers, "content-length", 0) do
{_, maybe_content_length} -> {_, maybe_content_length} ->
case Integer.parse(maybe_content_length) do case Integer.parse(maybe_content_length) do
{content_length, ""} when content_length <= @max_body -> :ok {content_length, ""} when content_length <= max_body -> :ok
{_, ""} -> {:error, :body_too_large} {_, ""} -> {:error, :body_too_large}
_ -> :ok _ -> :ok
end end
@ -126,4 +55,11 @@ defmodule Pleroma.Web.RichMedia.Helpers do
:ok :ok
end end
end end
defp http_options() do
[
pool: :media,
max_body: Config.get([:rich_media, :max_body], 2_000_000)
]
end
end end

View file

@ -5,137 +5,28 @@
defmodule Pleroma.Web.RichMedia.Parser do defmodule Pleroma.Web.RichMedia.Parser do
require Logger require Logger
@cachex Pleroma.Config.get([:cachex, :provider], Cachex) @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
defp parsers do defp parsers do
Pleroma.Config.get([:rich_media, :parsers]) Pleroma.Config.get([:rich_media, :parsers])
end end
def parse(nil), do: {:error, "No URL provided"} def parse(nil), do: nil
if Pleroma.Config.get(:env) == :test do
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url), do: parse_with_timeout(url)
else
@spec parse(String.t()) :: {:ok, map()} | {:error, any()} @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do def parse(url) do
with {:ok, data} <- get_cached_or_parse(url), with :ok <- validate_page_url(url),
{:ok, _} <- set_ttl_based_on_image(data, url) do {:ok, data} <- parse_url(url) do
data = Map.put(data, "url", url)
{:ok, data} {:ok, data}
end end
end end
defp get_cached_or_parse(url) do defp parse_url(url) do
case @cachex.fetch(:rich_media_cache, url, fn ->
case parse_with_timeout(url) do
{:ok, _} = res ->
{:commit, res}
{:error, reason} = e ->
# Unfortunately we have to log errors here, instead of doing that
# along with ttl setting at the bottom. Otherwise we can get log spam
# if more than one process was waiting for the rich media card
# while it was generated. Ideally we would set ttl here as well,
# so we don't override it number_of_waiters_on_generation
# times, but one, obviously, can't set ttl for not-yet-created entry
# and Cachex doesn't support returning ttl from the fetch callback.
log_error(url, reason)
{:commit, e}
end
end) do
{action, res} when action in [:commit, :ok] ->
case res do
{:ok, _data} = res ->
res
{:error, reason} = e ->
if action == :commit, do: set_error_ttl(url, reason)
e
end
{:error, e} ->
{:error, {:cachex_error, e}}
end
end
defp set_error_ttl(_url, :body_too_large), do: :ok
defp set_error_ttl(_url, {:content_type, _}), do: :ok
# The TTL is not set for the errors above, since they are unlikely to change
# with time
defp set_error_ttl(url, _reason) do
ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
@cachex.expire(:rich_media_cache, url, ttl)
:ok
end
defp log_error(url, {:invalid_metadata, data}) do
Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
end
defp log_error(url, reason) do
Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
end
end
@doc """
Set the rich media cache based on the expiration time of image.
Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL`
## Example
defmodule MyModule do
@behaviour Pleroma.Web.RichMedia.Parser.TTL
def ttl(data, url) do
image_url = Map.get(data, :image)
# do some parsing in the url and get the ttl of the image
# and return ttl is unix time
parse_ttl_from_url(image_url)
end
end
Define the module in the config
config :pleroma, :rich_media,
ttl_setters: [MyModule]
"""
@spec set_ttl_based_on_image(map(), String.t()) ::
{:ok, Integer.t() | :noop} | {:error, :no_key}
def set_ttl_based_on_image(data, url) do
case get_ttl_from_image(data, url) do
{:ok, ttl} when is_number(ttl) ->
ttl = ttl * 1000
case @cachex.expire_at(:rich_media_cache, url, ttl) do
{:ok, true} -> {:ok, ttl}
{:ok, false} -> {:error, :no_key}
end
_ ->
{:ok, :noop}
end
end
defp get_ttl_from_image(data, url) do
[:rich_media, :ttl_setters]
|> Pleroma.Config.get()
|> Enum.reduce({:ok, nil}, fn
module, {:ok, _ttl} ->
module.ttl(data, url)
_, error ->
error
end)
end
def parse_url(url) do
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url), with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
{:ok, html} <- Floki.parse_document(html) do {:ok, html} <- Floki.parse_document(html) do
html html
|> maybe_parse() |> maybe_parse()
|> Map.put("url", url)
|> clean_parsed_data() |> clean_parsed_data()
|> check_parsed_data() |> check_parsed_data()
end end

View file

@ -3,5 +3,17 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser.TTL do defmodule Pleroma.Web.RichMedia.Parser.TTL do
@callback ttl(Map.t(), String.t()) :: Integer.t() | nil @callback ttl(map(), String.t()) :: integer() | nil
def get_from_image(data, url) do
[:rich_media, :ttl_setters]
|> Pleroma.Config.get()
|> Enum.reduce({:ok, nil}, fn
module, {:ok, _ttl} ->
module.ttl(data, url)
_, error ->
error
end)
end
end end

View file

@ -7,7 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
@impl true @impl true
def ttl(data, _url) do def ttl(data, _url) do
image = Map.get(data, :image) image = Map.get(data, "image")
if is_aws_signed_url(image) do if is_aws_signed_url(image) do
image image

View file

@ -0,0 +1,15 @@
# Pleroma: A lightweight social networking server
# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Workers.RichMediaExpirationWorker do
alias Pleroma.Web.RichMedia.Card
use Oban.Worker,
queue: :rich_media_expiration
@impl Oban.Worker
def perform(%Job{args: %{"url" => url} = _args}) do
Card.delete(url)
end
end

View file

@ -0,0 +1,14 @@
defmodule Pleroma.Repo.Migrations.CreateRichMediaCard do
use Ecto.Migration
def change do
create table(:rich_media_card) do
add(:url_hash, :bytea)
add(:fields, :map)
timestamps()
end
create(unique_index(:rich_media_card, [:url_hash]))
end
end