Semantic Index (#10329)

This introduces semantic indexing in Zed based on chunking text from
files in the developer's workspace and creating vector embeddings using
an embedding model. As part of this, we've created an embeddings
provider trait that allows us to work with OpenAI, a local Ollama model,
or a Zed hosted embedding.

The semantic index is built by breaking down text for known
(programming) languages into manageable chunks that are smaller than the
max token size. Each chunk is then fed to a language model to create a
high dimensional vector which is then normalized to a unit vector to
allow fast comparison with other vectors with a simple dot product.
Alongside the vector, we store the path of the file and the range within
the document where the vector was sourced from.

Zed will soon grok contextual similarity across different text snippets,
allowing for natural language search beyond keyword matching. This is
being put together both for human-based search as well as providing
results to Large Language Models to allow them to refine how they help
developers.

Remaining todo:

* [x] Change `provider` to `model` within the zed hosted embeddings
database (as its currently a combo of the provider and the model in one
name)


Release Notes:

- N/A

---------

Co-authored-by: Nathan Sobo <nathan@zed.dev>
Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Conrad Irwin <conrad@zed.dev>
Co-authored-by: Marshall Bowers <elliott.codes@gmail.com>
Co-authored-by: Antonio <antonio@zed.dev>
This commit is contained in:
Kyle Kelley 2024-04-12 10:40:59 -07:00 committed by GitHub
parent 4b40e83b8b
commit 49371b44cb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 2649 additions and 41 deletions

View file

@ -204,6 +204,11 @@ message Envelope {
LanguageModelResponse language_model_response = 167;
CountTokensWithLanguageModel count_tokens_with_language_model = 168;
CountTokensResponse count_tokens_response = 169;
GetCachedEmbeddings get_cached_embeddings = 189;
GetCachedEmbeddingsResponse get_cached_embeddings_response = 190;
ComputeEmbeddings compute_embeddings = 191;
ComputeEmbeddingsResponse compute_embeddings_response = 192; // current max
UpdateChannelMessage update_channel_message = 170;
ChannelMessageUpdate channel_message_update = 171;
@ -216,7 +221,7 @@ message Envelope {
MultiLspQueryResponse multi_lsp_query_response = 176;
CreateRemoteProject create_remote_project = 177;
CreateRemoteProjectResponse create_remote_project_response = 188; // current max
CreateRemoteProjectResponse create_remote_project_response = 188;
CreateDevServer create_dev_server = 178;
CreateDevServerResponse create_dev_server_response = 179;
ShutdownDevServer shutdown_dev_server = 180;
@ -1892,6 +1897,29 @@ message CountTokensResponse {
uint32 token_count = 1;
}
message GetCachedEmbeddings {
string model = 1;
repeated bytes digests = 2;
}
message GetCachedEmbeddingsResponse {
repeated Embedding embeddings = 1;
}
message ComputeEmbeddings {
string model = 1;
repeated string texts = 2;
}
message ComputeEmbeddingsResponse {
repeated Embedding embeddings = 1;
}
message Embedding {
bytes digest = 1;
repeated float dimensions = 2;
}
message BlameBuffer {
uint64 project_id = 1;
uint64 buffer_id = 2;

View file

@ -151,6 +151,8 @@ messages!(
(ChannelMessageSent, Foreground),
(ChannelMessageUpdate, Foreground),
(CompleteWithLanguageModel, Background),
(ComputeEmbeddings, Background),
(ComputeEmbeddingsResponse, Background),
(CopyProjectEntry, Foreground),
(CountTokensWithLanguageModel, Background),
(CountTokensResponse, Background),
@ -174,6 +176,8 @@ messages!(
(FormatBuffers, Foreground),
(FormatBuffersResponse, Foreground),
(FuzzySearchUsers, Foreground),
(GetCachedEmbeddings, Background),
(GetCachedEmbeddingsResponse, Background),
(GetChannelMembers, Foreground),
(GetChannelMembersResponse, Foreground),
(GetChannelMessages, Background),
@ -325,6 +329,7 @@ request_messages!(
(CancelCall, Ack),
(CopyProjectEntry, ProjectEntryResponse),
(CompleteWithLanguageModel, LanguageModelResponse),
(ComputeEmbeddings, ComputeEmbeddingsResponse),
(CountTokensWithLanguageModel, CountTokensResponse),
(CreateChannel, CreateChannelResponse),
(CreateProjectEntry, ProjectEntryResponse),
@ -336,6 +341,7 @@ request_messages!(
(Follow, FollowResponse),
(FormatBuffers, FormatBuffersResponse),
(FuzzySearchUsers, UsersResponse),
(GetCachedEmbeddings, GetCachedEmbeddingsResponse),
(GetChannelMembers, GetChannelMembersResponse),
(GetChannelMessages, GetChannelMessagesResponse),
(GetChannelMessagesById, GetChannelMessagesResponse),