Created the basic PDF retriever program
This commit is contained in:
parent
2dd8a9b533
commit
50983e9469
66
main.py
Normal file
66
main.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import os
|
||||
import streamlit as st
|
||||
from PyPDF2 import PdfReader
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
from langchain_community.vectorstores import Chroma
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
# load_dotenv() above has already placed any .env values into os.environ, so
# the key never needs to be written back. The previous self-assignment only
# had an effect when the key was missing: os.environ rejects None and raised
# an opaque TypeError. Fail at the same point, but say what is actually wrong.
if os.environ.get('OPENAI_API_KEY') is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )
|
||||
|
||||
|
||||
|
||||
|
||||
def extractText(uploaded_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        uploaded_file: The PDF source handed straight to ``PdfReader``.

    Returns:
        The text extracted from each page, concatenated in page order with
        nothing inserted between pages.
    """
    reader = PdfReader(uploaded_file)
    return ''.join(page.extract_text() for page in reader.pages)
|
||||
|
||||
def chunkText(pdf_text, chunk_size=1000, chunk_overlap=200):
    """Split raw PDF text into overlapping documents ready for embedding.

    The text is split on newline characters and the pieces are merged back
    into chunks by ``CharacterTextSplitter``.

    Args:
        pdf_text: The full text to split.
        chunk_size: Target chunk length, measured with ``len`` (characters).
            Defaults to 1000, the value that used to be hard-coded.
        chunk_overlap: Number of characters neighbouring chunks may share.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        The list of documents produced by
        ``CharacterTextSplitter.create_documents`` for ``pdf_text``.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,  # "\n" is a literal separator, not a pattern
    )
    return splitter.create_documents([pdf_text])
|
||||
|
||||
|
||||
|
||||
def app():
    """Render the Streamlit page: upload a PDF, enter a query, show a match.

    Flow:
        1. Configure the page and show the uploader.
        2. Once a PDF is uploaded, extract its text and split it into chunks.
        3. When the button is pressed, embed the chunks into a FAISS index,
           retrieve the documents relevant to the query, and display the
           content of the first one.

    Nothing is embedded or sent to OpenAI unless there is a non-blank query
    and at least one text chunk to index; in those cases a message is shown
    on the page instead.
    """
    st.set_page_config(page_title='AI reads your PDFs')
    st.title("AI reads your PDFs")

    file = st.file_uploader('Upload your PDF file', type='pdf')
    if file is None:
        # Nothing uploaded yet: the rest of the page needs the document.
        return

    # Extract the text from the document and split it into chunks; the chunks
    # are embedded and loaded into the vector store only on demand below.
    text = extractText(file)
    text_sections = chunkText(text)

    query = st.text_input("Enter your query")

    if not st.button('retrieve from PDF'):
        return

    if not query.strip():
        # Avoid an embedding call for a query that cannot match anything.
        st.warning("Please enter a query before retrieving.")
        return

    if not text_sections:
        # No chunks (e.g. no text could be extracted) means nothing to index.
        st.warning("No text could be extracted from this PDF.")
        return

    retriever = FAISS.from_documents(text_sections, OpenAIEmbeddings()).as_retriever()
    docs = retriever.get_relevant_documents(query)

    if not docs:
        # Guard the docs[0] access below against an empty result list.
        st.info("No relevant passage was found for this query.")
        return

    # Show the content of the first returned document.
    st.write(docs[0].page_content)
|
||||
|
||||
|
||||
|
||||
|
||||
def main():
    """Entry point of the script: run the Streamlit app."""
    app()
|
||||
|
||||
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
langchain==0.1.6
|
||||
langchain_community==0.0.19
|
||||
langchain_openai==0.0.5
|
||||
PyPDF2==3.0.1
|
||||
python-dotenv==1.0.1
|
||||
streamlit==1.31.0
|
Loading…
Reference in New Issue
Block a user