mirror of
https://github.com/langgenius/dify.git
synced 2026-05-04 09:28:04 +08:00
update text spliter
This commit is contained in:
@ -1390,7 +1390,7 @@ class SegmentService:
|
||||
model=dataset.embedding_model,
|
||||
)
|
||||
# calc embedding use tokens
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
|
||||
lock_name = "add_segment_lock_document_id_{}".format(document.id)
|
||||
with redis_client.lock(lock_name, timeout=600):
|
||||
max_position = (
|
||||
@ -1467,9 +1467,9 @@ class SegmentService:
|
||||
if dataset.indexing_technique == "high_quality" and embedding_model:
|
||||
# calc embedding use tokens
|
||||
if document.doc_form == "qa_model":
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment_item["answer"]])
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment_item["answer"]])[0]
|
||||
else:
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
|
||||
segment_document = DocumentSegment(
|
||||
tenant_id=current_user.current_tenant_id,
|
||||
dataset_id=document.dataset_id,
|
||||
@ -1577,9 +1577,9 @@ class SegmentService:
|
||||
|
||||
# calc embedding use tokens
|
||||
if document.doc_form == "qa_model":
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment.answer])
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment.answer])[0]
|
||||
else:
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
|
||||
segment.content = content
|
||||
segment.index_node_hash = segment_hash
|
||||
segment.word_count = len(content)
|
||||
|
||||
Reference in New Issue
Block a user