feat: update pdf upsert

This commit is contained in:
ACBBZ
2025-08-19 14:19:20 +08:00
parent 85a2c21b69
commit e55d5c0a10

View File

@@ -8,7 +8,7 @@ import requests
import json import json
from langchain_community.document_loaders.csv_loader import CSVLoader from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader, UnstructuredHTMLLoader, JSONLoader, Docx2txtLoader, UnstructuredExcelLoader, UnstructuredPDFLoader from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader, UnstructuredHTMLLoader, JSONLoader, Docx2txtLoader, UnstructuredExcelLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
@@ -96,7 +96,7 @@ class ChromaUpsert(Blackbox):
file_type = file.split(".")[-1] file_type = file.split(".")[-1]
print("file_type: ",file_type) print("file_type: ",file_type)
if file_type == "pdf": if file_type == "pdf":
loader = UnstructuredPDFLoader(file) loader = PyPDFLoader(file)
elif file_type == "txt": elif file_type == "txt":
loader = TextLoader(file) loader = TextLoader(file)
elif file_type == "csv": elif file_type == "csv":