Merge branch 'main' into feat/workflow

# Conflicts:
#	docker/docker-compose.yaml
This commit is contained in:
takatost
2024-03-29 21:18:16 +08:00
40 changed files with 876 additions and 141 deletions

View File

@ -144,6 +144,16 @@ class MilvusVector(BaseVector):
utility.drop_collection(self._collection_name, None, using=alias)
def text_exists(self, id: str) -> bool:
alias = uuid4().hex
if self._client_config.secure:
uri = "https://" + str(self._client_config.host) + ":" + str(self._client_config.port)
else:
uri = "http://" + str(self._client_config.host) + ":" + str(self._client_config.port)
connections.connect(alias=alias, uri=uri, user=self._client_config.user, password=self._client_config.password)
from pymilvus import utility
if not utility.has_collection(self._collection_name, using=alias):
return False
result = self._client.query(collection_name=self._collection_name,
filter=f'metadata["doc_id"] == "{id}"',

View File

@ -275,6 +275,13 @@ class QdrantVector(BaseVector):
)
def text_exists(self, id: str) -> bool:
all_collection_name = []
collections_response = self._client.get_collections()
collection_list = collections_response.collections
for collection in collection_list:
all_collection_name.append(collection.name)
if self._collection_name not in all_collection_name:
return False
response = self._client.retrieve(
collection_name=self._collection_name,
ids=[id]

View File

@ -128,8 +128,8 @@ class Vector:
if kwargs.get('duplicate_check', False):
documents = self._filter_duplicate_texts(documents)
embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
self._vector_processor.add_texts(
documents=documents,
self._vector_processor.create(
texts=documents,
embeddings=embeddings,
**kwargs
)

View File

@ -134,6 +134,11 @@ class WeaviateVector(BaseVector):
def text_exists(self, id: str) -> bool:
collection_name = self._collection_name
schema = self._default_schema(self._collection_name)
# check whether the index already exists
if not self._client.schema.contains(schema):
return False
result = self._client.query.get(collection_name).with_additional(["id"]).with_where({
"path": ["doc_id"],
"operator": "Equal",

View File

@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor):
elements = partition_docx(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
pass
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
elements = partition_md(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
elements = partition_msg(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor):
elements = partition_text(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()