akkoma/lib/pleroma/web/rich_media/card.ex

defmodule Pleroma.Web.RichMedia.Card do
  @moduledoc """
  Ecto schema and lookup helpers for rich media cards.

  Cards are stored in the `rich_media_card` table and keyed by `url_hash`, the
  lowercase hex SHA-256 digest of the URL (see `url_to_hash/1`). Reads go through
  the `:rich_media_cache` cache and fall back to the database on a miss.
  """

  use Ecto.Schema
  import Ecto.Changeset
  import Ecto.Query

  alias Pleroma.Activity
  alias Pleroma.HTML
  alias Pleroma.Object
  alias Pleroma.Repo
  alias Pleroma.Web.RichMedia.Backfill
  alias Pleroma.Web.RichMedia.Parser

  @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
  @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)

  @type t :: %__MODULE__{}

  schema "rich_media_card" do
    field(:url_hash, :binary)
    field(:fields, :map)

    timestamps()
  end

  @doc false
  def changeset(card, attrs) do
    card
    |> cast(attrs, [:url_hash, :fields])
    |> validate_required([:url_hash, :fields])
    |> unique_constraint(:url_hash)
  end

  @doc """
  Inserts a card for `url`, replacing the stored `fields` if a card with the
  same URL hash already exists.

  The `"url"` key is added to `fields` unless the caller already supplied one.
  Returns the result of `Repo.insert/2`, i.e. `{:ok, card}` or
  `{:error, changeset}` when validation fails.
  """
  @spec create(String.t(), map()) :: {:ok, t()} | {:error, Ecto.Changeset.t()}
  def create(url, fields) do
    url_hash = url_to_hash(url)

    fields = Map.put_new(fields, "url", url)

    %__MODULE__{}
    |> changeset(%{url_hash: url_hash, fields: fields})
    |> Repo.insert(on_conflict: {:replace, [:fields]}, conflict_target: :url_hash)
  end

  @doc """
  Deletes the card stored for `url` and evicts it from `:rich_media_cache`.

  Returns the result of `Repo.delete/1` when a card exists and `:ok` when there
  is nothing to delete.
  """
  @spec delete(String.t()) :: {:ok, Ecto.Schema.t()} | {:error, Ecto.Changeset.t()} | :ok
  def delete(url) do
    url_hash = url_to_hash(url)

    # Query the database directly instead of going through get_by_url/1:
    # that function would re-commit the card into the cache right before the
    # row is removed, and it returns :error when rich media is disabled.
    result =
      case fetch_from_db(url_hash) do
        %__MODULE__{} = card -> Repo.delete(card)
        nil -> :ok
      end

    # Evict only after the row is gone, so a concurrent read cannot re-cache it
    # from the database afterwards.
    @cachex.del(:rich_media_cache, url_hash)

    result
  end

  @doc """
  Looks up the card for `url` through `:rich_media_cache`.

  Returns the card, `nil` when no card is stored (or `url` is `nil`), or
  `:error` when the `[:rich_media, :enabled]` setting is falsy. Only found
  cards are committed to the cache; misses are not cached.
  """
  @spec get_by_url(String.t() | nil) :: t() | nil | :error
  def get_by_url(url) when is_binary(url) do
    if @config_impl.get([:rich_media, :enabled]) do
      url_hash = url_to_hash(url)

      @cachex.fetch!(:rich_media_cache, url_hash, fn _ ->
        case fetch_from_db(url_hash) do
          %__MODULE__{} = card -> {:commit, card}
          _ -> {:ignore, nil}
        end
      end)
    else
      :error
    end
  end

  def get_by_url(nil), do: nil

  @doc """
  Returns the card for `url` if one exists; otherwise calls `Backfill.start/1`
  with `backfill_opts` (plus the `:url` key) and returns `nil`.

  Also returns `nil`, without starting a backfill, when `get_by_url/1` reports
  `:error`.
  """
  @spec get_or_backfill_by_url(String.t(), map()) :: t() | nil
  def get_or_backfill_by_url(url, backfill_opts \\ %{}) do
    case get_by_url(url) do
      %__MODULE__{} = card ->
        card

      nil ->
        backfill_opts = Map.put(backfill_opts, :url, url)

        Backfill.start(backfill_opts)

        nil

      :error ->
        nil
    end
  end

  @doc """
  Returns the card for the first external URL found in `object`, starting a
  backfill if no card is stored yet. Returns `nil` when the object has no
  external URL.
  """
  @spec get_by_object(Object.t()) :: t() | nil | :error
  def get_by_object(object) do
    case HTML.extract_first_external_url_from_object(object) do
      nil -> nil
      url -> get_or_backfill_by_url(url)
    end
  end

  @doc """
  Returns the card for the first external URL of the object behind `activity`.

  For the fake/draft activity (id `"pleroma:fakeid"`) a missing card is parsed
  and stored synchronously via `Parser.parse/1` and `create/2`; any failure
  yields `nil`. For every other activity the extracted URL is cached per
  activity and a missing card triggers `get_or_backfill_by_url/2`. Returns
  `:error` when the activity cannot be normalized to an object.
  """
  @spec get_by_activity(Activity.t()) :: t() | nil | :error
  # Fake/Draft activity
  def get_by_activity(%Activity{id: "pleroma:fakeid"} = activity) do
    with %Object{} = object <- Object.normalize(activity, fetch: false),
         url when not is_nil(url) <- HTML.extract_first_external_url_from_object(object) do
      case get_by_url(url) do
        # Cache hit
        %__MODULE__{} = card ->
          card

        # Cache miss, but fetch for rendering the Draft
        _ ->
          with {:ok, fields} <- Parser.parse(url),
               {:ok, card} <- create(url, fields) do
            card
          else
            _ -> nil
          end
      end
    else
      _ ->
        nil
    end
  end

  def get_by_activity(activity) do
    with %Object{} = object <- Object.normalize(activity, fetch: false),
         {_, nil} <- {:cached, get_cached_url(object, activity.id)} do
      # The object has no external URL, so there is no card to show.
      nil
    else
      {:cached, url} ->
        get_or_backfill_by_url(url, %{activity_id: activity.id})

      _ ->
        :error
    end
  end

  @doc """
  Returns the lowercase hex-encoded SHA-256 digest of `url`.
  """
  @spec url_to_hash(String.t()) :: String.t()
  def url_to_hash(url) do
    :crypto.hash(:sha256, url) |> Base.encode16(case: :lower)
  end

  # Loads the card row matching `url_hash` straight from the database,
  # bypassing the cache. Returns the card or nil.
  defp fetch_from_db(url_hash) do
    __MODULE__
    |> where(url_hash: ^url_hash)
    |> Repo.one()
  end

  # Returns the first external URL of `object`, memoised in :scrubber_cache
  # under "URL|<activity_id>". The result is committed even when it is nil, and
  # the key is registered through Activity.HTML.add_cache_key_for/2.
  defp get_cached_url(object, activity_id) do
    key = "URL|#{activity_id}"

    @cachex.fetch!(:scrubber_cache, key, fn _ ->
      url = HTML.extract_first_external_url_from_object(object)
      Activity.HTML.add_cache_key_for(activity_id, key)

      {:commit, url}
    end)
  end
end
RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had image links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receive a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 2024-02-11 14:11:52 -07:00			`defmodule Pleroma.Web.RichMedia.Card do`
			`use Ecto.Schema`
			`import Ecto.Changeset`
			`import Ecto.Query`

			`alias Pleroma.Activity`
			`alias Pleroma.HTML`
			`alias Pleroma.Object`
			`alias Pleroma.Repo`
			`alias Pleroma.Web.RichMedia.Backfill`
			`alias Pleroma.Web.RichMedia.Parser`

			`@cachex Pleroma.Config.get([:cachex, :provider], Cachex)`
			`@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)`

			`@type t :: %__MODULE__{}`

			`schema "rich_media_card" do`
			`field(:url_hash, :binary)`
			`field(:fields, :map)`

			`timestamps()`
			`end`

			`@doc false`
			`def changeset(card, attrs) do`
			`card`
			`\|> cast(attrs, [:url_hash, :fields])`
			`\|> validate_required([:url_hash, :fields])`
			`\|> unique_constraint(:url_hash)`
			`end`

			`@spec create(String.t(), map()) :: {:ok, t()}`
			`def create(url, fields) do`
			`url_hash = url_to_hash(url)`

			`fields = Map.put_new(fields, "url", url)`

			`%__MODULE__{}`
			`\|> changeset(%{url_hash: url_hash, fields: fields})`
			`\|> Repo.insert(on_conflict: {:replace, [:fields]}, conflict_target: :url_hash)`
			`end`

			`@spec delete(String.t()) :: {:ok, Ecto.Schema.t()} \| {:error, Ecto.Changeset.t()} \| :ok`
			`def delete(url) do`
			`url_hash = url_to_hash(url)`
			`@cachex.del(:rich_media_cache, url_hash)`

			`case get_by_url(url) do`
			`%__MODULE{} = card -> Repo.delete(card)`
			`nil -> :ok`
			`end`
			`end`

			`@spec get_by_url(String.t() \| nil) :: t() \| nil \| :error`
			`def get_by_url(url) when is_binary(url) do`
			`if @config_impl.get([:rich_media, :enabled]) do`
			`url_hash = url_to_hash(url)`

			`@cachex.fetch!(:rich_media_cache, url_hash, fn _ ->`
			`result =`
			`__MODULE__`
			`\|> where(url_hash: ^url_hash)`
			`\|> Repo.one()`

			`case result do`
			`%__MODULE__{} = card -> {:commit, card}`
			`_ -> {:ignore, nil}`
			`end`
			`end)`
			`else`
			`:error`
			`end`
			`end`

			`def get_by_url(nil), do: nil`

			`@spec get_or_backfill_by_url(String.t(), map()) :: t() \| nil`
			`def get_or_backfill_by_url(url, backfill_opts \\ %{}) do`
			`case get_by_url(url) do`
			`%__MODULE__{} = card ->`
			`card`

			`nil ->`
			`backfill_opts = Map.put(backfill_opts, :url, url)`

			`Backfill.start(backfill_opts)`

			`nil`

			`:error ->`
			`nil`
			`end`
			`end`

			`@spec get_by_object(Object.t()) :: t() \| nil \| :error`
			`def get_by_object(object) do`
			`case HTML.extract_first_external_url_from_object(object) do`
			`nil -> nil`
			`url -> get_or_backfill_by_url(url)`
			`end`
			`end`

			`@spec get_by_activity(Activity.t()) :: t() \| nil \| :error`
			`# Fake/Draft activity`
			`def get_by_activity(%Activity{id: "pleroma:fakeid"} = activity) do`
			`with %Object{} = object <- Object.normalize(activity, fetch: false),`
			`url when not is_nil(url) <- HTML.extract_first_external_url_from_object(object) do`
			`case get_by_url(url) do`
			`# Cache hit`
			`%__MODULE__{} = card ->`
			`card`

			`# Cache miss, but fetch for rendering the Draft`
			`_ ->`
			`with {:ok, fields} <- Parser.parse(url),`
			`{:ok, card} <- create(url, fields) do`
			`card`
			`else`
			`_ -> nil`
			`end`
			`end`
			`else`
			`_ ->`
			`nil`
			`end`
			`end`

			`def get_by_activity(activity) do`
			`with %Object{} = object <- Object.normalize(activity, fetch: false),`
			`{_, nil} <- {:cached, get_cached_url(object, activity.id)} do`
			`nil`
			`else`
			`{:cached, url} ->`
			`get_or_backfill_by_url(url, %{activity_id: activity.id})`

			`_ ->`
			`:error`
			`end`
			`end`

			`@spec url_to_hash(String.t()) :: String.t()`
			`def url_to_hash(url) do`
			`:crypto.hash(:sha256, url) \|> Base.encode16(case: :lower)`
			`end`

			`defp get_cached_url(object, activity_id) do`
			`key = "URL\|#{activity_id}"`

			`@cachex.fetch!(:scrubber_cache, key, fn _ ->`
			`url = HTML.extract_first_external_url_from_object(object)`
			`Activity.HTML.add_cache_key_for(activity_id, key)`

			`{:commit, url}`
			`end)`
			`end`
			`end`