Add documentation for ES search

FloatingGhost 2022-06-30 17:36:57 +01:00
parent 635a3c223a
commit bc9e76cce7
4 changed files with 278 additions and 0 deletions


@@ -3472,5 +3472,90 @@ config :pleroma, :config_description, [
suggestion: [100_000]
}
]
},
%{
group: :pleroma,
key: Pleroma.Search.Elasticsearch.Cluster,
type: :group,
description: "Elasticsearch settings.",
children: [
%{
key: :url,
type: :string,
description: "Elasticsearch URL.",
suggestion: ["http://127.0.0.1:9200/"]
},
%{
key: :username,
type: :string,
description: "Username to connect to ES. Set to nil if your cluster is unauthenticated.",
suggestion: ["elastic"]
},
%{
key: :password,
type: :string,
description: "Password to connect to ES. Set to nil if your cluster is unauthenticated.",
suggestion: ["changeme"]
},
%{
key: :api,
type: :module,
description:
"The API module used by Elasticsearch. Should always be Elasticsearch.API.HTTP",
suggestion: [Elasticsearch.API.HTTP]
},
%{
key: :json_library,
type: :module,
description:
"The JSON module used to encode/decode when communicating with Elasticsearch",
suggestion: [Jason]
},
%{
key: :indexes,
type: :map,
description: "The indices to set up in Elasticsearch",
children: [
%{
key: :activities,
type: :map,
description: "Config for the index to use for activities",
children: [
%{
key: :settings,
type: :string,
description:
"Path to the file containing index settings for the activities index. Should contain a mapping.",
suggestion: ["priv/es-mappings/activity.json"]
},
%{
key: :store,
type: :module,
description: "The internal store module",
suggestion: [Pleroma.Search.Elasticsearch.Store]
},
%{
key: :sources,
type: {:list, :module},
description: "The internal types to use for this index",
suggestion: [[Pleroma.Activity]]
},
%{
key: :bulk_page_size,
type: :int,
description: "Size for bulk put requests, mostly used on building the index",
suggestion: [5000]
},
%{
key: :bulk_wait_interval,
type: :int,
description: "Time to wait between bulk put requests (in ms)",
suggestion: [15_000]
}
]
}
]
}
]
}
]
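
For orientation, the settings described above assemble into a runtime configuration along these lines. This is an illustrative sketch built from the suggested values, not mandatory defaults; adjust the URL, credentials, and paths to your cluster:

```elixir
# Illustrative only: mirrors the config description above.
config :pleroma, Pleroma.Search.Elasticsearch.Cluster,
  url: "http://127.0.0.1:9200/",
  username: "elastic",
  password: "changeme",
  api: Elasticsearch.API.HTTP,
  json_library: Jason,
  indexes: %{
    activities: %{
      settings: "priv/es-mappings/activity.json",
      store: Pleroma.Search.Elasticsearch.Store,
      sources: [Pleroma.Activity],
      bulk_page_size: 5000,
      bulk_wait_interval: 15_000
    }
  }
```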


@@ -121,3 +121,43 @@ This will clear **all** the posts from the search index. Note, that deleted post
there is no need to actually clear the whole index, unless you want **all** of it gone. That said, the index does not hold any information
that cannot be re-created from the database; it should also generally be a lot smaller than the size of your database. Still, the size
depends on the amount of text in posts.
## Elasticsearch
As with Meilisearch, Elasticsearch can be rather memory-hungry, but it is very good at what it does.
To use [Elasticsearch](https://www.elastic.co/), set the search module to `Pleroma.Search.Elasticsearch`:
> config :pleroma, Pleroma.Search, module: Pleroma.Search.Elasticsearch
You then need to set the URL and, if relevant, the authentication credentials:
> config :pleroma, Pleroma.Search.Elasticsearch.Cluster,
> url: "http://127.0.0.1:9200/",
> username: "elastic",
> password: "changeme",
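If your cluster has no authentication at all, the config description above says both credentials can be set to `nil`; a minimal sketch:
> config :pleroma, Pleroma.Search.Elasticsearch.Cluster,
>   url: "http://127.0.0.1:9200/",
>   username: nil,
>   password: nil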
### Initial indexing
After setting up the configuration, you'll want to index all of your already existing posts. Only public posts are indexed. You'll only
have to do this once, but it might take a while, depending on the number of posts your instance has seen.
The sequence of actions is as follows:
1. First, change the configuration to use `Pleroma.Search.Elasticsearch` as the search backend
2. Restart your instance; at this point it can be used while the search indexing is running, though searches won't return anything
3. Start the initial indexing process (as described below with `index`), and wait until the task reports that it has sent everything from the database to the index
4. Wait until the indexing task exits
To start the initial indexing, run the appropriate command for your install:
=== "OTP"
```sh
./bin/pleroma_ctl search.elasticsearch index activities --cluster Pleroma.Search.Elasticsearch.Cluster
```
=== "From Source"
```sh
mix elasticsearch.build activities --cluster Pleroma.Search.Elasticsearch.Cluster
```
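
Once the task has finished, you can sanity-check the result directly against the cluster. This uses plain Elasticsearch tooling rather than anything Pleroma-specific, and assumes the default URL from the configuration above:

```sh
curl "http://127.0.0.1:9200/_cat/indices?v"
```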


@@ -0,0 +1,9 @@
defmodule Mix.Tasks.Pleroma.Search.Elasticsearch do
alias Mix.Tasks.Elasticsearch.Build
import Mix.Pleroma
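# Boots the Pleroma application, then forwards the remaining arguments
# (index name, --cluster flag, ...) to the elasticsearch-elixir
# library's build task, which creates and populates the index.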
def run(["index" | args]) do
start_pleroma()
Build.run(args)
end
end
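
For reference, by Elixir's mix task naming convention the module above is invoked from a source install as sketched below; `activities` and the `--cluster` flag are simply forwarded to the library's build task, matching the OTP command in the docs above:

```sh
mix pleroma.search.elasticsearch index activities --cluster Pleroma.Search.Elasticsearch.Cluster
```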


@@ -0,0 +1,144 @@
# Pleroma: A lightweight social networking server
# Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
require Pleroma.Constants
import Mix.Pleroma
import Ecto.Query
import Pleroma.Search.Meilisearch,
only: [meili_post: 2, meili_put: 2, meili_get: 1, meili_delete!: 1]
def run(["index"]) do
start_pleroma()
{:ok, result} = meili_get("/version")
meili_version = result["pkgVersion"]
# The ranking rule syntax was changed but nothing about that is mentioned in the changelog
if not Version.match?(meili_version, ">= 0.25.0") do
raise "Meilisearch <0.24.0 not supported"
end
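
# Rank hits by publish date first, so newer posts surface ahead of
# otherwise-better textual matches, then fall back to the usual
# relevancy rules (word matches, exactness, proximity, typos, ...).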
{:ok, _} =
meili_post(
"/indexes/objects/settings/ranking-rules",
[
"published:desc",
"words",
"exactness",
"proximity",
"typo",
"attribute",
"sort"
]
)
{:ok, _} =
meili_post(
"/indexes/objects/settings/searchable-attributes",
[
"content"
]
)
IO.puts("Created indices. Starting to insert posts.")
chunk_size = Pleroma.Config.get([Pleroma.Search.Meilisearch, :initial_indexing_chunk_size])
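
# Ecto's Repo.stream/2 only works inside a transaction, so the whole
# indexing pass is wrapped in one, with timeouts disabled since this
# can run for a long time on large instances.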
Pleroma.Repo.transaction(
fn ->
query =
from(Pleroma.Object,
# Only index public and unlisted posts which are notes and have some text
where:
fragment("data->>'type' = 'Note'") and
(fragment("data->'to' \\? ?", ^Pleroma.Constants.as_public()) or
fragment("data->'cc' \\? ?", ^Pleroma.Constants.as_public())),
order_by: [desc: fragment("data->'published'")]
)
count = query |> Pleroma.Repo.aggregate(:count, :data)
IO.puts("Entries to index: #{count}")
Pleroma.Repo.stream(
query,
timeout: :infinity
)
|> Stream.map(&Pleroma.Search.Meilisearch.object_to_search_data/1)
|> Stream.filter(fn o -> not is_nil(o) end)
|> Stream.chunk_every(chunk_size)
|> Stream.transform(0, fn objects, acc ->
new_acc = acc + Enum.count(objects)
# Reset to the beginning of the line and rewrite it
IO.write("\r")
IO.write("Indexed #{new_acc} entries")
{[objects], new_acc}
end)
|> Stream.each(fn objects ->
result =
meili_put(
"/indexes/objects/documents",
objects
)
with {:ok, res} <- result do
if not Map.has_key?(res, "uid") do
IO.puts("\nFailed to index: #{inspect(result)}")
end
else
e -> IO.puts("\nFailed to index due to network error: #{inspect(e)}")
end
end)
|> Stream.run()
end,
timeout: :infinity
)
IO.write("\n")
end
def run(["clear"]) do
start_pleroma()
meili_delete!("/indexes/objects/documents")
end
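
# Lists the API keys known to the Meilisearch instance, e.g.
# mix pleroma.search.meilisearch show-keys <master key>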
def run(["show-keys", master_key]) do
start_pleroma()
endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url])
{:ok, result} =
Pleroma.HTTP.get(
Path.join(endpoint, "/keys"),
[{"Authorization", "Bearer #{master_key}"}]
)
decoded = Jason.decode!(result.body)
if decoded["results"] do
Enum.each(decoded["results"], fn %{"description" => desc, "key" => key} ->
IO.puts("#{desc}: #{key}")
end)
else
IO.puts("Error fetching the keys, check the master key is correct: #{inspect(decoded)}")
end
end
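
# Prints the number of indexed documents and whether an indexing run
# is currently in progress, e.g. mix pleroma.search.meilisearch stats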
def run(["stats"]) do
start_pleroma()
{:ok, result} = meili_get("/indexes/objects/stats")
IO.puts("Number of entries: #{result["numberOfDocuments"]}")
IO.puts("Indexing? #{result["isIndexing"]}")
end
end