Skip to content

Add Typesense #44

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
08e7b64
add typesense
ruslandoga Nov 13, 2024
b59abed
fix ci
ruslandoga Sep 27, 2024
79edf7d
Update lib/hexdocs/queue.ex
ruslandoga Oct 2, 2024
67ae461
add compose
ruslandoga Oct 3, 2024
4eecb02
hide typesense behind impl
ruslandoga Oct 3, 2024
1c727da
simplify find_search_items/3
ruslandoga Oct 3, 2024
2ed6007
cleanup test
ruslandoga Oct 3, 2024
f0a39a0
include typesense in ci tests
ruslandoga Oct 21, 2024
649397a
add post in hexdocs.http
ruslandoga Oct 21, 2024
6333e8c
read typesense indexing response eagerly
ruslandoga Oct 21, 2024
e95f897
make collection configurable
ruslandoga Oct 21, 2024
2862552
remove content-type: text/plain from import
ruslandoga Oct 21, 2024
25938fb
update typesense to 27.1
ruslandoga Oct 21, 2024
7ae762a
do typesense indexing last
ruslandoga Oct 21, 2024
f436ed7
extract indexing to separate function, wrap in log lines
ruslandoga Oct 21, 2024
7ce5be5
refactor
ruslandoga Nov 10, 2024
f70d9b4
handle more errors
ruslandoga Nov 10, 2024
64481ed
add proglang to collection
ruslandoga Nov 10, 2024
a29793d
refactor tests a bit
ruslandoga Nov 10, 2024
460bc7a
test find_search_items/3
ruslandoga Nov 10, 2024
a70bc4b
test package delete
ruslandoga Nov 10, 2024
321b806
refactor log tests
ruslandoga Nov 10, 2024
58e923d
add bad document test
ruslandoga Nov 10, 2024
d7cb3b9
remove search_data_js format error logs when there is no search_data_js
ruslandoga Nov 10, 2024
dd222ea
rm releases.exs
ruslandoga Nov 13, 2024
bfc0539
read Typesense collection from env
ruslandoga Nov 13, 2024
1a97386
switch to :json
ruslandoga Nov 13, 2024
1087e6a
update tests
ruslandoga Nov 13, 2024
8d58e4f
add tests for invalid search items
ruslandoga Nov 13, 2024
9a839e4
Merge branch 'main' into add-typesense
wojtekmach Dec 11, 2024
a4ba5c3
Prepare hexdocs staging for typesense
wojtekmach Dec 11, 2024
26169bb
wip
wojtekmach Dec 11, 2024
ff16251
Revert "wip"
wojtekmach Dec 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ jobs:
- name: Check mix format
run: mix format --check-formatted

- name: Start Typesense
run: docker compose up -d typesense

- name: Run tests
run: |
mix test
mix test --include typesense
6 changes: 6 additions & 0 deletions compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Compose file providing a Typesense server; CI starts it with
# `docker compose up -d typesense` before running `mix test --include typesense`.
services:
typesense:
image: typesense/typesense:27.1
# The API key given here is the same value as `typesense_api_key` in config/config.exs.
command: --data-dir /tmp --api-key=hexdocs
ports:
# Published on the port used by `typesense_url` in config/config.exs.
- 8108:8108
4 changes: 4 additions & 0 deletions config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@ config :hexdocs,
port: "4002",
hexpm_url: "http://localhost:4000",
hexpm_secret: "2cd6d09334d4b00a2be4d532342b799b",
typesense_url: "http://localhost:8108",
typesense_api_key: "hexdocs",
typesense_collection: "hexdocs",
hexpm_impl: Hexdocs.Hexpm.Impl,
store_impl: Hexdocs.Store.Local,
cdn_impl: Hexdocs.CDN.Local,
search_impl: Hexdocs.Search.Local,
source_repo_impl: Hexdocs.SourceRepo.GitHub,
tmp_dir: "tmp",
queue_id: "test",
Expand Down
3 changes: 2 additions & 1 deletion config/dev.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ config :hexdocs,
hexpm_url: "http://localhost:4000",
hexpm_impl: Hexdocs.Hexpm.Impl,
store_impl: Hexdocs.Store.Local,
cdn_impl: Hexdocs.CDN.Local
cdn_impl: Hexdocs.CDN.Local,
search_impl: Hexdocs.Search.Local
1 change: 1 addition & 0 deletions config/prod.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ config :hexdocs,
hexpm_impl: Hexdocs.Hexpm.Impl,
store_impl: Hexdocs.Store.Impl,
cdn_impl: Hexdocs.CDN.Fastly,
search_impl: Hexdocs.Search.Typesense,
queue_producer: BroadwaySQS.Producer,
gcs_put_debounce: 3000

Expand Down
3 changes: 3 additions & 0 deletions config/runtime.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ if config_env() == :prod do
port: System.fetch_env!("HEXDOCS_PORT"),
hexpm_url: System.fetch_env!("HEXDOCS_HEXPM_URL"),
hexpm_secret: System.fetch_env!("HEXDOCS_HEXPM_SECRET"),
typesense_url: System.fetch_env!("HEXDOCS_TYPESENSE_URL"),
typesense_api_key: System.fetch_env!("HEXDOCS_TYPESENSE_API_KEY"),
typesense_collection: System.fetch_env!("HEXDOCS_TYPESENSE_COLLECTION"),
fastly_key: System.fetch_env!("HEXDOCS_FASTLY_KEY"),
fastly_hexdocs: System.fetch_env!("HEXDOCS_FASTLY_HEXDOCS"),
queue_id: System.fetch_env!("HEXDOCS_QUEUE_ID"),
Expand Down
1 change: 1 addition & 0 deletions config/test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ config :hexdocs,
hexpm_impl: Hexdocs.HexpmMock,
store_impl: Hexdocs.Store.Local,
cdn_impl: Hexdocs.CDN.Local,
search_impl: Hexdocs.Search.Local,
source_repo_impl: Hexdocs.SourceRepo.Mock

config :logger, level: :warning
4 changes: 4 additions & 0 deletions lib/hexdocs/http.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ defmodule Hexdocs.HTTP do
|> read_response()
end

# Issues a POST through :hackney and hands back hackney's result untouched.
# Unlike `delete/2` below, the result is NOT piped through `read_response/1`,
# so a caller that wants the body in the return tuple passes `:with_body` in `opts`.
def post(url, headers, body, opts \\ []),
  do: :hackney.post(url, headers, body, opts)

def delete(url, headers) do
:hackney.delete(url, headers)
|> read_response()
Expand Down
18 changes: 17 additions & 1 deletion lib/hexdocs/queue.ex
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,12 @@ defmodule Hexdocs.Queue do
update_package_sitemap(repository, key, package, files)
end

if repository == "hexpm" do
update_search_index(key, package, version, files)
end

elapsed = System.os_time(:millisecond) - start
Logger.info("FINISHED UPLOADING DOCS #{key} #{elapsed}ms")
Logger.info("FINISHED UPLOADING AND INDEXING DOCS #{key} #{elapsed}ms")

{:error, reason} ->
Logger.error("Failed unpack #{repository}/#{package} #{version}: #{reason}")
Expand All @@ -149,6 +153,10 @@ defmodule Hexdocs.Queue do
Hexdocs.Bucket.delete(repository, package, version, all_versions)
update_index_sitemap(repository, key)

if repository == "hexpm" do
Hexdocs.Search.delete(package, version)
end

elapsed = System.os_time(:millisecond) - start
Logger.info("FINISHED DELETING DOCS #{key} #{elapsed}ms")
:ok
Expand Down Expand Up @@ -228,6 +236,14 @@ defmodule Hexdocs.Queue do
:ok
end

# Looks up the search items among the unpacked doc `files` and, when some are
# found, sends them to the configured search backend, logging before and after.
# When `find_search_items/3` yields anything other than a `{proglang, items}`
# tuple, that value is returned unchanged and nothing is indexed.
defp update_search_index(key, package, version, files) do
  case Hexdocs.Search.find_search_items(package, version, files) do
    {proglang, items} ->
      Logger.info("UPDATING SEARCH INDEX #{key}")
      Hexdocs.Search.index(package, version, proglang, items)
      Logger.info("UPDATED SEARCH INDEX #{key}")

    not_found ->
      not_found
  end
end

@doc false
def paths_for_sitemaps() do
key_regex = ~r"docs/(.*)-(.*).tar.gz$"
Expand Down
9 changes: 9 additions & 0 deletions lib/hexdocs/search/local.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
defmodule Hexdocs.Search.Local do
  @moduledoc """
  No-op `Hexdocs.Search` implementation.

  Both callbacks ignore their arguments and return `:ok` without contacting
  any search backend.
  """

  @behaviour Hexdocs.Search

  @impl true
  def index(_package, _version, _proglang, _search_items) do
    :ok
  end

  @impl true
  def delete(_package, _version) do
    :ok
  end
end
93 changes: 93 additions & 0 deletions lib/hexdocs/search/search.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Behaviour and facade for search indexing. `index/4` and `delete/2` delegate to
# the module configured under `:hexdocs, :search_impl`; `find_search_items/3`
# extracts the search items from a package's unpacked documentation files.
defmodule Hexdocs.Search do
require Logger

@type package :: String.t()
@type version :: Version.t()
@type proglang :: String.t()
@type search_items :: [map]

# Callbacks every search backend must implement.
@callback index(package, version, proglang, search_items) :: :ok
@callback delete(package, version) :: :ok

# Resolved on every call from application config; raises if the key is not set.
defp impl, do: Application.fetch_env!(:hexdocs, :search_impl)

# Forwards the items to the configured backend's `index/4`.
@spec index(package, version, proglang, search_items) :: :ok
def index(package, version, proglang, search_items) do
impl().index(package, version, proglang, search_items)
end

# Forwards to the configured backend's `delete/2`.
@spec delete(package, version) :: :ok
def delete(package, version) do
impl().delete(package, version)
end

# Extracts `{proglang, search_items}` from the given `{path, content}` files.
#
# Steps: take the content of the first file whose basename starts with
# "search_data-", strip the leading "searchData=" to obtain JSON, decode it, and
# return the "items" list when it is non-empty. `proglang` comes from the decoded
# data's "proglang" key, falling back to `proglang/1` below.
#
# Returns `nil` when no such file exists (logged at info level) or when the
# content has an unexpected prefix, fails to decode, or has no non-empty
# "items" list (each logged at error level).
@spec find_search_items(package, version, [{Path.t(), content :: iodata}]) ::
{proglang, search_items} | nil
def find_search_items(package, version, files) do
search_data_js =
Enum.find_value(files, fn {path, content} ->
case Path.basename(path) do
"search_data-" <> _digest -> content
# NOTE(review): the lines from here up to `_other -> nil` appear to be
# review-thread text captured together with the diff, not Elixir source.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot trust this data since it's user provided, can they do anything dangerous by providing something we don't expect? Maybe we should do some rudimentary validation?

Copy link
Contributor Author

@ruslandoga ruslandoga Nov 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They can provide long strings like https://github.com/cloudpods-dev/docker-engine-api-elixir/blob/813cc557da483f623a8f484db04efc7e58db0376/lib/docker_engine_api/api/container.ex#L67, but Typesense seems to handle it fine. We can check for content size, maybe. I think if Typesense doesn't like the payload, it would simply reject it.

Copy link
Contributor Author

@ruslandoga ruslandoga Nov 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a test that checks that invalid fields in search items (like type being a map instead of a string, or doc being a list) are rejected: 8d58e4f

_other -> nil
end
end)

# A missing search data file is only logged at info level.
unless search_data_js do
Logger.info("Failed to find search data for #{package} #{version}")
end

# The file is a JS assignment; everything after "searchData=" is treated as JSON.
search_data_json =
case search_data_js do
"searchData=" <> json ->
json

_ when is_binary(search_data_js) ->
Logger.error("Unexpected search_data format for #{package} #{version}")
nil

nil ->
nil
end

# Anything raised, thrown or exited during decoding is logged and turned into
# `nil`; when there is no JSON at all the `if` itself evaluates to `nil`.
search_data =
if search_data_json do
try do
:json.decode(search_data_json)
catch
_kind, reason ->
Logger.error(
"Failed to decode search data json for #{package} #{version}: " <>
inspect(reason)
)

nil
end
end

case search_data do
# Only a non-empty "items" list counts as usable search data.
%{"items" => [_ | _] = search_items} ->
proglang = Map.get(search_data, "proglang") || proglang(search_items)
{proglang, search_items}

# Failure was already logged by one of the steps above.
nil ->
nil

_ ->
Logger.error(
"Failed to extract search items and proglang from search data for #{package} #{version}"
)

nil
end
end

# Fallback language detection: "elixir" if any item passes `elixir_module?/1`,
# otherwise "erlang".
defp proglang(search_items) do
if Enum.any?(search_items, &elixir_module?/1), do: "elixir", else: "erlang"
end

# True for an item of type "module" whose title starts with an ASCII uppercase letter.
defp elixir_module?(%{"type" => "module", "title" => <<first_letter, _::binary>>})
when first_letter in ?A..?Z,
do: true

defp elixir_module?(_), do: false
end
105 changes: 105 additions & 0 deletions lib/hexdocs/search/typesense.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
defmodule Hexdocs.Search.Typesense do
  @moduledoc false
  # `Hexdocs.Search` implementation backed by a Typesense server.
  #
  # Connection details (`:typesense_url`, `:typesense_api_key`,
  # `:typesense_collection`) are read from the `:hexdocs` application
  # environment on every call. Every document is tagged with a "package" field
  # of the form "<package>-<version>", which `delete/2` uses to remove all
  # documents of one release.

  require Logger
  alias Hexdocs.HTTP

  @behaviour Hexdocs.Search

  # Only these keys of a (user-provided) search item are forwarded to Typesense.
  @item_fields ["type", "ref", "title", "doc"]

  # Imports `search_items` into the collection as newline-delimited JSON.
  #
  # Failures (transport error, non-200 status, individual documents rejected by
  # Typesense) are logged and otherwise ignored; the function returns `:ok`.
  @impl true
  def index(package, version, proglang, search_items) do
    full_package = full_package(package, version)

    # Built as iodata: one JSON document per line.
    ndjson =
      Enum.map(search_items, fn item ->
        json =
          item
          |> Map.take(@item_fields)
          |> Map.put("package", full_package)
          |> Map.put("proglang", proglang)
          |> :json.encode()

        [json, ?\n]
      end)

    url = url("collections/#{collection()}/documents/import?action=create")

    # `:with_body` makes hackney return the response body in the result tuple.
    case HTTP.post(url, headers(), ndjson, [:with_body]) do
      {:ok, 200, _resp_headers, body} ->
        log_import_failures(body, package, version)

      {:ok, status, _resp_headers, _body} ->
        Logger.error("Failed to index search items for #{package} #{version}: status=#{status}")

      {:error, reason} ->
        Logger.error("Failed to index search items #{package} #{version}: #{inspect(reason)}")
    end
  end

  # Deletes every document belonging to `package` at `version`.
  #
  # Failures are logged and otherwise ignored; the function returns `:ok`.
  @impl true
  def delete(package, version) do
    full_package = full_package(package, version)

    # `:=` requests an exact match. The plain `:` operator is a word-level,
    # non-exact match, and because the collection splits tokens on ".", "-" and
    # "_" it could also match (and therefore delete) documents of other releases
    # whose "package" value is made of the same tokens.
    query = URI.encode_query([{"filter_by", "package:=#{full_package}"}])
    url = url("collections/#{collection()}/documents?" <> query)

    case HTTP.delete(url, headers()) do
      {:ok, 200, _resp_headers, _body} ->
        :ok

      {:ok, status, _resp_headers, _body} ->
        Logger.error("Failed to delete search items for #{package} #{version}: status=#{status}")

      {:error, reason} ->
        Logger.error(
          "Failed to delete search items for #{package} #{version}: #{inspect(reason)}"
        )
    end
  end

  # Name of the Typesense collection, from application config.
  @spec collection :: String.t()
  def collection do
    Application.fetch_env!(:hexdocs, :typesense_collection)
  end

  # Schema map for creating the collection; `collection` defaults to the
  # configured collection name.
  @spec collection_schema :: map
  @spec collection_schema(String.t()) :: map
  def collection_schema(collection \\ collection()) do
    %{
      "fields" => [
        %{"facet" => true, "name" => "proglang", "type" => "string"},
        %{"facet" => true, "name" => "type", "type" => "string"},
        %{"name" => "title", "type" => "string"},
        %{"name" => "doc", "type" => "string"},
        %{"facet" => true, "name" => "package", "type" => "string"}
      ],
      "name" => collection,
      "token_separators" => [".", "_", "-", " ", ":", "@", "/"]
    }
  end

  # Typesense API key, from application config.
  @spec api_key :: String.t()
  def api_key do
    Application.fetch_env!(:hexdocs, :typesense_api_key)
  end

  # Logs every per-document failure reported in an import response body, which
  # holds one JSON result per line. Empty lines are skipped, and a result of an
  # unexpected shape is logged instead of raising a CaseClauseError.
  defp log_import_failures(body, package, version) do
    body
    |> String.split("\n", trim: true)
    |> Enum.each(fn line ->
      case :json.decode(line) do
        %{"success" => true} ->
          :ok

        %{"success" => false, "error" => error, "document" => document} ->
          Logger.error(
            "Failed to index search item for #{package} #{version} for document #{inspect(document)}: #{inspect(error)}"
          )

        other ->
          Logger.error(
            "Unexpected import result for #{package} #{version}: #{inspect(other)}"
          )
      end
    end)
  end

  # Authentication header sent with every request.
  defp headers do
    [{"x-typesense-api-key", api_key()}]
  end

  # Value stored in (and filtered on) the "package" field of every document.
  defp full_package(package, version) do
    "#{package}-#{version}"
  end

  # Joins `path` onto the configured Typesense base URL.
  defp url(path) do
    base_url = Application.fetch_env!(:hexdocs, :typesense_url)
    Path.join(base_url, path)
  end
end
Loading
Loading