akkoma/lib/pleroma/web/rich_media/parser.ex

# Pleroma: A lightweight social networking server
# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only

defmodule Pleroma.Web.RichMedia.Parser do
  @moduledoc """
  Fetches a page and extracts rich media metadata from it.

  `parse/1` is the only public entry point. It validates the URL, fetches the
  document through `Pleroma.Web.RichMedia.Helpers.rich_media_get/1`, parses it
  with Floki and hands the parsed document to the modules configured under
  `[:rich_media, :parsers]` until one of them returns a non-empty result.
  """

  require Logger

  @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)

  # Modules tried, in order, by `maybe_parse/1`.
  # NOTE(review): this reads `Pleroma.Config` directly while every other lookup
  # in this module goes through `@config_impl` - confirm that this is intended.
  defp parsers do
    Pleroma.Config.get([:rich_media, :parsers])
  end

  @doc """
  Fetches `url` and returns the metadata extracted from the page.

  Return values:

    * `nil` - `url` is `nil`
    * `{:ok, data}` - metadata with a usable `"title"`; `url` is stored under
      the `"url"` key
    * `{:error, :rich_media_disabled}` - `[:rich_media, :enabled]` is not `true`
    * `:error` - the URL was rejected by validation (not a URL, not `https`,
      an IP address, or an ignored host/TLD)
    * `{:error, {:invalid_metadata, data}}` - no non-empty binary `"title"`
      was found
    * any other value produced by the fetch or by `Floki.parse_document/1`
      that is not an `{:ok, _}` match is returned unchanged
  """
  @spec parse(String.t() | nil) :: {:ok, map()} | {:error, any()} | :error | nil
  def parse(nil), do: nil

  def parse(url) do
    with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
         :ok <- validate_page_url(url),
         {:ok, data} <- parse_url(url) do
      {:ok, Map.put(data, "url", url)}
    else
      {:config, _} -> {:error, :rich_media_disabled}
      # Validation, fetch and parse failures are handed to the caller as they are.
      e -> e
    end
  end

  # Fetches the page, parses the HTML and runs the metadata pipeline.
  # There is no `else`: a non-matching step result is returned unchanged.
  defp parse_url(url) do
    with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
         {:ok, html} <- Floki.parse_document(html) do
      html
      |> maybe_parse()
      |> clean_parsed_data()
      |> check_parsed_data()
    end
  end

  # Runs the configured parsers in order and stops at the first one returning
  # something other than an empty map. Yields `%{}` when none of them does.
  defp maybe_parse(html) do
    Enum.reduce_while(parsers(), %{}, fn parser, acc ->
      case parser.parse(html, acc) do
        data when data != %{} -> {:halt, data}
        _ -> {:cont, acc}
      end
    end)
  end

  # Metadata is only accepted when it carries a non-empty binary "title".
  defp check_parsed_data(%{"title" => title} = data)
       when is_binary(title) and title != "" do
    {:ok, data}
  end

  defp check_parsed_data(data) do
    {:error, {:invalid_metadata, data}}
  end

  # Keeps only the entries that can be encoded to JSON on their own.
  defp clean_parsed_data(data) do
    data
    |> Enum.filter(fn {key, val} -> match?({:ok, _}, Jason.encode(%{key => val})) end)
    |> Map.new()
  end

  # Binary input is first checked with Linkify and then re-validated as a
  # parsed `URI`. Only `https` URIs with a host can pass; anything else is
  # `:error`.
  @spec validate_page_url(URI.t() | binary()) :: :ok | :error
  defp validate_page_url(page_url) when is_binary(page_url) do
    validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])

    page_url
    |> Linkify.Parser.url?(validate_tld: validate_tld)
    |> parse_uri(page_url)
  end

  defp validate_page_url(%URI{host: host, scheme: "https"}) when is_binary(host) do
    # Host names are case-insensitive and may end in the DNS root dot, so the
    # host is normalised before it is compared against the ignore lists.
    # Otherwise "Foo.LOCAL" or "foo.local." would slip past an entry "local".
    host = normalize_host(host)

    cond do
      host == "" ->
        :error

      Linkify.Parser.ip?(host) ->
        :error

      ignored?(host, :ignore_hosts) ->
        :error

      ignored?(get_tld(host), :ignore_tld) ->
        :error

      true ->
        :ok
    end
  end

  defp validate_page_url(_), do: :error

  # Second validation step: only reached with `true` when Linkify accepted
  # the string as a URL.
  defp parse_uri(true, url) do
    url
    |> URI.parse()
    |> validate_page_url()
  end

  defp parse_uri(_, _), do: :error

  # True when `value` equals one of the binaries configured under
  # `[:rich_media, config_key]`, compared after normalising both sides.
  defp ignored?(value, config_key) do
    configured = @config_impl.get([:rich_media, config_key], [])

    Enum.any?(configured, fn entry ->
      is_binary(entry) and normalize_host(entry) == value
    end)
  end

  # Lower-cases the name and drops trailing dots.
  defp normalize_host(host) do
    host
    |> String.downcase()
    |> String.trim_trailing(".")
  end

  # Last dot-separated label of the host.
  defp get_tld(host) do
    host
    |> String.split(".")
    |> List.last()
  end
end
rich media: parser: add copyright header 2019-01-28 12:59:36 -07:00			`# Pleroma: A lightweight social networking server`
Fix tests 2024-06-09 11:28:00 -06:00			`# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>`
rich media: parser: add copyright header 2019-01-28 12:59:36 -07:00			`# SPDX-License-Identifier: AGPL-3.0-only`

Add OGP parser 2019-01-01 13:26:40 -07:00			`defmodule Pleroma.Web.RichMedia.Parser do`
don't fail on url fetch 2020-09-01 10:12:45 -06:00			`require Logger`

RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 2024-02-11 14:11:52 -07:00			`@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)`
Cachex: Make caching provider switchable at runtime. Defaults to Cachex. 2020-12-18 09:44:46 -07:00
parsers configurable 2019-07-11 07:04:42 -06:00			`defp parsers do`
			`Pleroma.Config.get([:rich_media, :parsers])`
			`end`

RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 2024-02-11 14:11:52 -07:00			`def parse(nil), do: nil`
rich media: gracefully handle fetching nil URIs 2019-01-26 09:26:11 -07:00
RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 2024-02-11 14:11:52 -07:00			`@spec parse(String.t()) :: {:ok, map()} \| {:error, any()}`
			`def parse(url) do`
Fix tests 2024-06-09 11:28:00 -06:00			`with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},`
			`:ok <- validate_page_url(url),`
RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 2024-02-11 14:11:52 -07:00			`{:ok, data} <- parse_url(url) do`
			`data = Map.put(data, "url", url)`
			`{:ok, data}`
Fix tests 2024-06-09 11:28:00 -06:00			`else`
			`{:config, _} -> {:error, :rich_media_disabled}`
			`e -> e`
RichMedia: Fix log spam on failures and resetting TTL on cached errors 2020-09-17 07:13:21 -06:00			`end`
rich media: disable cachex in test mode 2019-01-04 16:50:54 -07:00			`end`
Add OGP parser 2019-01-01 13:26:40 -07:00
RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 2024-02-11 14:11:52 -07:00			`defp parse_url(url) do`
don't fail on url fetch 2020-09-01 10:12:45 -06:00			`with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),`
			`{:ok, html} <- Floki.parse_document(html) do`
Replace missing non-nullable Card attributes with empty strings 2019-05-30 15:03:31 -06:00			`html`
			`\|> maybe_parse()`
			`\|> clean_parsed_data()`
			`\|> check_parsed_data()`
rich media: add try/rescue to ensure we catch parsing and fetching failures 2019-01-27 05:21:05 -07:00			`end`
Add RichMediaController and tests 2019-01-02 07:02:50 -07:00			`end`

			`defp maybe_parse(html) do`
parsers configurable 2019-07-11 07:04:42 -06:00			`Enum.reduce_while(parsers(), %{}, fn parser, acc ->`
Add OGP parser 2019-01-01 13:26:40 -07:00			`case parser.parse(html, acc) do`
Merge OGP parser with TwitterCard 2020-06-11 07:57:31 -06:00			`data when data != %{} -> {:halt, data}`
			`_ -> {:cont, acc}`
Add OGP parser 2019-01-01 13:26:40 -07:00			`end`
			`end)`
			`end`
Add RichMediaController and tests 2019-01-02 07:02:50 -07:00
Fix atom leak in Rich Media Parser 2020-06-09 11:49:24 -06:00			`defp check_parsed_data(%{"title" => title} = data)`
			`when is_binary(title) and title != "" do`
rich media: parser: add some basic sanity checks on the returned data with pattern matching 2019-01-28 13:31:43 -07:00			`{:ok, data}`
Add RichMediaController and tests 2019-01-02 07:02:50 -07:00			`end`

rich media: parser: reject OGP fields we cannot safely process 2019-01-31 09:03:56 -07:00			`defp check_parsed_data(data) do`
RichMedia: do not log webpages missing metadata as errors Also fixes the return value of Parser.parse on errors, previously was just `:ok` due to the logger call in the end 2020-09-05 13:00:51 -06:00			`{:error, {:invalid_metadata, data}}`
Add RichMediaController and tests 2019-01-02 07:02:50 -07:00			`end`
rich media: parser: reject OGP fields we cannot safely process 2019-01-31 09:03:56 -07:00
			`defp clean_parsed_data(data) do`
			`data`
rich media: parser: reject any data which cannot be explicitly encoded into JSON 2019-02-05 13:50:57 -07:00			`\|> Enum.reject(fn {key, val} ->`
Fix atom leak in Rich Media Parser 2020-06-09 11:49:24 -06:00			`not match?({:ok, _}, Jason.encode(%{key => val}))`
rich media: parser: reject OGP fields we cannot safely process 2019-01-31 09:03:56 -07:00			`end)`
			`\|> Map.new()`
			`end`
Fix tests 2024-06-09 11:28:00 -06:00
			`@spec validate_page_url(URI.t() \| binary()) :: :ok \| :error`
			`defp validate_page_url(page_url) when is_binary(page_url) do`
			`validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])`

			`page_url`
			`\|> Linkify.Parser.url?(validate_tld: validate_tld)`
			`\|> parse_uri(page_url)`
			`end`

			`defp validate_page_url(%URI{host: host, scheme: "https"}) do`
			`cond do`
			`Linkify.Parser.ip?(host) ->`
			`:error`

			`host in @config_impl.get([:rich_media, :ignore_hosts], []) ->`
			`:error`

			`get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->`
			`:error`

			`true ->`
			`:ok`
			`end`
			`end`

			`defp validate_page_url(_), do: :error`

			`defp parse_uri(true, url) do`
			`url`
			`\|> URI.parse()`
			`\|> validate_page_url`
			`end`

			`defp parse_uri(_, _), do: :error`

			`defp get_tld(host) do`
			`host`
			`\|> String.split(".")`
			`\|> Enum.reverse()`
			`\|> hd`
			`end`
Add OGP parser 2019-01-01 13:26:40 -07:00			`end`