"""Streamlit app that analyzes a service level agreement (SLA) PDF.

Flow: the user uploads a PDF, its text is extracted and chunked by the
project helpers in ``utilities.utils``, each chunk is sent to the model via
``utilities.api.call_prompt``, and the JSON responses are parsed, merged and
rendered as raw JSON plus summary tables.
"""

import json

import fitz  # PyMuPDF for PDF text extraction
import pandas as pd
import streamlit as st

from utilities.api import call_prompt
from utilities.utils import extract_text_from_pdf, chunk_text, construct_prompt_for_pdf

# Prompt sent for every chunk; ``{pdf_input}`` is filled in by
# ``construct_prompt_for_pdf``. The text is kept exactly as it was.
prompt_template = (
    " You are an AI assistant analyzing a service level agreement (SLA) document."
    " Extract the following structured information in JSON format:"
    " 1. Airline name and ground handling partner."
    " 2. Effective date of the agreement."
    " 3. Airport location."
    " 4. Delay codes penalized, including target ranges and penalties."
    " 5. Baggage loss reasons and corresponding penalties."
    " Here is the document content: {pdf_input}"
    " Provide output in JSON format only. "
)

# Top-level keys the result tables look for in the model's JSON answer.
_DELAY_KEY = "delay_penalties"
_BAGGAGE_KEY = "baggage_loss_penalties"
_METADATA_KEYS = ["airline", "ground_handling_partner", "effective_date", "airport"]


def _parse_json_response(raw):
    """Parse one model response as JSON, tolerating a Markdown code fence.

    Raises:
        TypeError: if *raw* is not a string.
        ValueError: if the text is not valid JSON (``json.JSONDecodeError``
            is a subclass of ``ValueError``).
    """
    if not isinstance(raw, str):
        raise TypeError(f"expected str response, got {type(raw).__name__}")
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0]
    return json.loads(text)


def _is_blank(value):
    """Return True for values that carry no information (None, "", [], {})."""
    return value is None or (isinstance(value, (str, list, dict)) and not value)


def _merge_chunk_results(chunk_objects):
    """Merge per-chunk JSON objects into a single dict.

    Lists found under the same key are concatenated in chunk order; for any
    other value the first non-blank occurrence wins. With a single chunk the
    result has the same keys and values as that chunk's object.
    """
    merged = {}
    for chunk_data in chunk_objects:
        for key, value in chunk_data.items():
            if key not in merged or _is_blank(merged[key]):
                merged[key] = value
            elif isinstance(merged[key], list) and isinstance(value, list):
                merged[key] = merged[key] + value
    return merged


def _query_chunks(pdf_file):
    """Extract and chunk the text of *pdf_file*; return one raw response per chunk."""
    # Step 1: Extract text from the uploaded PDF
    with st.spinner("Extracting text from PDF..."):
        pdf_text = extract_text_from_pdf(pdf_file)

    # Step 2: Chunk the text if necessary
    with st.spinner("Chunking large text..."):
        chunks = chunk_text(pdf_text)

    # Step 3: Query the model once per chunk
    raw_results = []
    with st.spinner("Querying GPT-4-Turbo..."):
        for position, chunk in enumerate(chunks, start=1):
            st.text(f"Processing chunk {position}/{len(chunks)}...")
            # NOTE(review): ``custom_prompt`` from the text area is not used
            # here; every request is built from ``prompt_template``. Confirm
            # whether the user-edited prompt is meant to be sent instead.
            prompt = construct_prompt_for_pdf(prompt_template=prompt_template, pdf_input=chunk)
            result = call_prompt(prompt)
            print("result:", result)
            raw_results.append(result)
    return raw_results


def _render_tables(json_data):
    """Render the penalty and metadata tables found in the merged *json_data*."""
    has_delay = _DELAY_KEY in json_data
    has_baggage = _BAGGAGE_KEY in json_data
    if not (has_delay or has_baggage):
        return

    st.subheader("Extracted Data Table")

    if has_delay:
        st.write("**Delay Penalties:**")
        st.table(pd.DataFrame(json_data[_DELAY_KEY]))

    if has_baggage:
        st.write("**Baggage Loss Penalties:**")
        st.table(pd.DataFrame(json_data[_BAGGAGE_KEY]))

    # Other metadata (airline, airport, ...); missing keys are shown as "N/A".
    metadata = {key: json_data.get(key, "N/A") for key in _METADATA_KEYS}
    st.write("**Metadata:**")
    st.table(pd.DataFrame([metadata]))


def _render_results(raw_results):
    """Parse the per-chunk responses and display raw JSON plus summary tables."""
    if not raw_results:
        st.warning("No text chunks were produced from the PDF; nothing to analyze.")
        return

    print("combined_result:", "\n".join(str(item) for item in raw_results))

    # Each response is parsed separately: several JSON documents joined with
    # newlines are not valid JSON, so parsing the joined text would always
    # fail as soon as the PDF needs more than one chunk.
    parsed, failed = [], []
    for raw in raw_results:
        try:
            parsed.append(_parse_json_response(raw))
        except (TypeError, ValueError):
            failed.append(raw)

    # Step 4: Display raw JSON response
    st.subheader("Raw JSON Response")
    if failed:
        st.error("Failed to parse JSON response.")
        st.text("\n".join(str(item) for item in failed))  # Show raw response if parsing fails

    chunk_objects = [item for item in parsed if isinstance(item, dict)]
    json_data = _merge_chunk_results(chunk_objects)
    if chunk_objects:
        st.json(json_data)
    elif parsed:
        # Valid JSON that is not an object (e.g. a list): show it, no tables.
        st.json(parsed[0] if len(parsed) == 1 else parsed)

    # Step 5: Display results in a table; ``json_data`` is always a dict here,
    # so a parse failure can no longer leave it undefined.
    _render_tables(json_data)


# Streamlit App Layout
st.title("PDF SLA Analyzer")

# File uploader for PDF
st.header("Upload a PDF File")
uploaded_file = st.file_uploader("Choose a PDF file to analyze", type=["pdf"])

# Text input for custom prompt
st.header("Enter Your Custom Prompt")
custom_prompt = st.text_area(
    "Custom Prompt",
    value=(
        " You are an AI assistant analyzing a service level agreement (SLA) document."
        " Extract the following structured information in JSON format:"
        " 1. Airline name and ground handling partner."
        " 2. Effective date of the agreement."
        " 3. Airport location."
        " 4. Delay codes penalized, including target ranges and penalties."
        " 5. Baggage loss reasons and corresponding penalties."
        " Provide output in JSON format only. \n"
    ),
)

# Button to process the PDF and call the API
if st.button("Analyze"):
    if uploaded_file is None:
        st.error("Please upload a PDF file to analyze.")
    else:
        _render_results(_query_chunks(uploaded_file))