Created the basic pdf retriever program
This commit is contained in:
parent
2dd8a9b533
commit
50983e9469
66
main.py
Normal file
66
main.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
import os
|
||||||
|
import streamlit as st
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
from langchain.text_splitter import CharacterTextSplitter
|
||||||
|
from langchain_community.vectorstores import FAISS
|
||||||
|
from langchain_openai import OpenAIEmbeddings
|
||||||
|
from langchain.text_splitter import CharacterTextSplitter
|
||||||
|
from langchain_community.vectorstores import Chroma
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
# Load variables from a local .env file (if present) into the process
# environment so the OpenAI key can be configured without exporting it.
load_dotenv()

# Re-export the key for the OpenAI client. Guard against it being absent:
# os.environ[...] = None raises TypeError, so the original one-liner crashed
# at import time whenever the key was not configured.
_api_key = os.environ.get('OPENAI_API_KEY')
if _api_key is not None:
    os.environ['OPENAI_API_KEY'] = _api_key
|
def extractText(uploaded_file):
    """Extract the full text of an uploaded PDF.

    Parameters
    ----------
    uploaded_file : file-like object
        An open PDF stream (e.g. the value returned by Streamlit's
        ``st.file_uploader``).

    Returns
    -------
    str
        The concatenated text of every page. Pages with no extractable
        text contribute an empty string.
    """
    reader = PdfReader(uploaded_file)
    # extract_text() can return None for image-only/scanned pages; coerce
    # to '' so the join never fails with a TypeError. ''.join also avoids
    # the quadratic cost of repeated string concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)
|
def chunkText(pdf_text):
    """Split raw PDF text into overlapping character chunks.

    Parameters
    ----------
    pdf_text : str
        The full extracted text of the document.

    Returns
    -------
    list
        Document chunks produced by
        ``CharacterTextSplitter.create_documents``: newline-separated
        chunks of up to 1000 characters with a 200-character overlap.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = CharacterTextSplitter(**splitter_options)
    return splitter.create_documents([pdf_text])
|
def app():
    """Streamlit page: upload a PDF, ask a question, show the best match.

    Flow: extract the PDF's text, split it into overlapping chunks, embed
    the chunks into an in-memory FAISS index, then retrieve the chunks
    most relevant to the user's query and display the top hit.
    """
    st.set_page_config(page_title='AI reads your PDFs')
    st.title("AI reads your PDFs")

    file = st.file_uploader('Upload your PDF file', type='pdf')
    if file is not None:
        # Extract text from the document and split it into chunks ready
        # for embedding and retrieval.
        text = extractText(file)
        text_sections = chunkText(text)

        query = st.text_input("Enter your query")

        if st.button('retrieve from PDF'):
            if not query:
                # Don't embed an empty query — ask the user for input.
                st.warning('Please enter a query first.')
                return
            # NOTE(review): the index is rebuilt (and re-embedded, at API
            # cost) on every button press; consider st.cache_resource.
            retriever = FAISS.from_documents(
                text_sections, OpenAIEmbeddings()
            ).as_retriever()
            docs = retriever.get_relevant_documents(query)
            if docs:
                st.write(docs[0].page_content)
            else:
                # docs[0] would raise IndexError when nothing matched.
                st.write('No relevant passage found.')
|
def main():
    """Program entry point: delegate to the Streamlit app."""
    app()
|
# Run the app only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
langchain==0.1.6
|
||||||
|
langchain_community==0.0.19
|
||||||
|
langchain_openai==0.0.5
|
||||||
|
PyPDF2==3.0.1
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
streamlit==1.31.0
|
Loading…
Reference in New Issue
Block a user