Created the basic PDF retriever program

This commit is contained in:
Ahmed Alrufai 2024-02-11 02:11:34 +02:00
parent 2dd8a9b533
commit 50983e9469
2 changed files with 72 additions and 0 deletions

66
main.py Normal file
View File

@@ -0,0 +1,66 @@
import os
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# The OpenAI embeddings client is constructed later without an explicit key,
# so the key has to be present in the environment. Check it up front:
# os.environ only accepts str values, so copying the result of .get() back
# into it would raise an opaque "str expected, not NoneType" TypeError when
# the variable is missing.
if os.environ.get('OPENAI_API_KEY') is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a "
        ".env file next to this script."
    )
def extractText(uploaded_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        uploaded_file: The PDF source, handed unchanged to ``PdfReader``.
            ``app()`` passes the object returned by ``st.file_uploader``.

    Returns:
        One string holding each page's extracted text back to back, with
        nothing inserted between pages.
    """
    reader = PdfReader(uploaded_file)
    # join() assembles the result in a single pass instead of re-copying the
    # accumulated string once per page.
    return ''.join(page.extract_text() for page in reader.pages)
def chunkText(pdf_text, chunk_size=1000, chunk_overlap=200):
    """Split raw text into overlapping chunks for embedding.

    The text is cut on newline characters and handed to
    ``CharacterTextSplitter``, which produces the chunk documents.

    Args:
        pdf_text: The full text to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Amount of trailing text (same unit as ``chunk_size``)
            the splitter may repeat at the start of the following chunk.

    Returns:
        Whatever ``CharacterTextSplitter.create_documents`` returns for the
        single input text; ``app()`` reads ``page_content`` from the
        documents it retrieves from these.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.create_documents([pdf_text])
def app():
    """Render the Streamlit page: upload a PDF, enter a query, show a match.

    Once a PDF is uploaded its text is extracted and chunked. Pressing the
    button embeds the chunks into an in-memory FAISS index, retrieves the
    chunks relevant to the query, and writes the first one to the page.
    Each situation that would otherwise end in an exception (no extractable
    text, blank query, no hits) is reported with a warning instead.
    """
    st.set_page_config(page_title='AI reads your PDFs')
    st.title("AI reads your PDFs")

    file = st.file_uploader('Upload your PDF file', type='pdf')
    if file is None:
        # Nothing to work on until the user uploads a document.
        return

    # Extract the document text and split it into chunks ready for embedding.
    text = extractText(file)
    text_sections = chunkText(text)
    query = st.text_input("Enter your query")

    if not st.button('retrieve from PDF'):
        return

    if not text_sections:
        # Scanned/image-only PDFs yield no text, and there is nothing to
        # index or search without at least one chunk.
        st.warning("No extractable text was found in this PDF.")
        return
    if not (query or "").strip():
        st.warning("Please enter a query before retrieving.")
        return

    # The index is rebuilt on every click; nothing is cached between reruns.
    retriever = FAISS.from_documents(
        text_sections, OpenAIEmbeddings()
    ).as_retriever()
    docs = retriever.get_relevant_documents(query)
    if not docs:
        # Guard the docs[0] access below against an empty result list.
        st.warning("No matching passage was found for this query.")
        return

    # Show only the first returned chunk.
    st.write(docs[0].page_content)
def main():
    """Script entry point; hands control to the Streamlit page renderer."""
    app()
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

6
requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
langchain==0.1.6
langchain_community==0.0.19
langchain_openai==0.0.5
PyPDF2==3.0.1
python-dotenv==1.0.1
streamlit==1.31.0