added token count to documents during parsing
parent a7e6a65deb
commit e377ada1a9
4 changed files with 54 additions and 12 deletions
@@ -54,6 +54,8 @@ struct OpenAIEmbeddingUsage {

#[async_trait]
pub trait EmbeddingProvider: Sync + Send {
    async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>>;
    fn count_tokens(&self, span: &str) -> usize;
    // fn truncate(&self, span: &str) -> Result<&str>;
}

pub struct DummyEmbeddings {}
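The new `count_tokens` method is what lets the indexer record a token count on each document as it is parsed. A minimal sketch of that usage, assuming a `Document` struct with a `token_count` field (both names are illustrative, not taken from this commit):

// Sketch only: `Document`, `token_count`, and `parse_document` are
// hypothetical names used to show how the trait method might be consumed.
struct Document {
    content: String,
    token_count: usize,
}

fn parse_document(provider: &dyn EmbeddingProvider, content: String) -> Document {
    // Count tokens once at parse time so later limit checks and batching are cheap.
    let token_count = provider.count_tokens(&content);
    Document { content, token_count }
}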
@@ -66,6 +68,12 @@ impl EmbeddingProvider for DummyEmbeddings {
        let dummy_vec = vec![0.32 as f32; 1536];
        return Ok(vec![dummy_vec; spans.len()]);
    }

    fn count_tokens(&self, span: &str) -> usize {
        // For Dummy Providers, we are going to use OpenAI tokenization for ease
        let tokens = OPENAI_BPE_TOKENIZER.encode_with_special_tokens(span);
        tokens.len()
    }
}

const OPENAI_INPUT_LIMIT: usize = 8190;
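`OPENAI_INPUT_LIMIT` hints at how the token counts are meant to be used: a span can be checked against the model's input limit before it is sent to `embed_batch`. A hedged sketch of such a check (the helper name is an assumption; the commit itself only adds the constant and `count_tokens`):

// Illustrative helper, not part of this commit: report whether a span
// fits within the OpenAI embedding input limit.
fn within_limit(provider: &dyn EmbeddingProvider, span: &str) -> bool {
    provider.count_tokens(span) <= OPENAI_INPUT_LIMIT
}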
@@ -111,6 +119,12 @@ impl OpenAIEmbeddings {

#[async_trait]
impl EmbeddingProvider for OpenAIEmbeddings {
    fn count_tokens(&self, span: &str) -> usize {
        // Use the shared OpenAI BPE tokenizer to count tokens for this span
        let tokens = OPENAI_BPE_TOKENIZER.encode_with_special_tokens(span);
        tokens.len()
    }

    async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> {
        const BACKOFF_SECONDS: [usize; 4] = [3, 5, 15, 45];
        const MAX_RETRIES: usize = 4;
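The hunk is truncated here, so the body of `embed_batch` is not shown. For orientation, a rough sketch of how constants like `BACKOFF_SECONDS` and `MAX_RETRIES` are typically driven in a retry loop; this is an assumption about the pattern, not the commit's actual implementation. `fetch` stands in for the HTTP request, and the `Result` alias is assumed to be `anyhow::Result`:

// Sketch of retry-with-backoff using constants like the ones above.
// In real async code the sleep would use the executor's timer rather
// than blocking the thread.
use anyhow::{anyhow, Result};

fn with_backoff<T>(mut fetch: impl FnMut() -> Result<T>) -> Result<T> {
    const BACKOFF_SECONDS: [usize; 4] = [3, 5, 15, 45];
    const MAX_RETRIES: usize = 4;
    let mut last_err = None;
    for attempt in 0..MAX_RETRIES {
        match fetch() {
            Ok(value) => return Ok(value),
            Err(err) => {
                last_err = Some(err);
                if attempt + 1 < MAX_RETRIES {
                    // Wait longer after each failed attempt before retrying.
                    std::thread::sleep(std::time::Duration::from_secs(
                        BACKOFF_SECONDS[attempt] as u64,
                    ));
                }
            }
        }
    }
    Err(last_err.unwrap_or_else(|| anyhow!("no attempts were made")))
}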