From e55d5c0a109edd55654a34e8ee8dc8fe189d0997 Mon Sep 17 00:00:00 2001 From: ACBBZ Date: Tue, 19 Aug 2025 14:19:20 +0800 Subject: [PATCH] feat: update pdf upsert --- src/blackbox/chroma_upsert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blackbox/chroma_upsert.py b/src/blackbox/chroma_upsert.py index fda0e14..98b13f9 100755 --- a/src/blackbox/chroma_upsert.py +++ b/src/blackbox/chroma_upsert.py @@ -8,7 +8,7 @@ import requests import json from langchain_community.document_loaders.csv_loader import CSVLoader -from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader, UnstructuredHTMLLoader, JSONLoader, Docx2txtLoader, UnstructuredExcelLoader, UnstructuredPDFLoader +from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader, UnstructuredHTMLLoader, JSONLoader, Docx2txtLoader, UnstructuredExcelLoader, PyPDFLoader from langchain_community.vectorstores import Chroma from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings @@ -96,7 +96,7 @@ class ChromaUpsert(Blackbox): file_type = file.split(".")[-1] print("file_type: ",file_type) if file_type == "pdf": - loader = UnstructuredPDFLoader(file) + loader = PyPDFLoader(file) elif file_type == "txt": loader = TextLoader(file) elif file_type == "csv":