From 3c8d08dc032a6ea8ef0180ce2ef171f6ca8eb716 Mon Sep 17 00:00:00 2001 From: Christbru Date: Sun, 19 Oct 2025 11:59:13 -0500 Subject: [PATCH] Fix files --- rust-engine/Cargo.lock | 1 + rust-engine/Cargo.toml | 1 + rust-engine/src/file_worker.rs | 47 ++++++++++++++++++++++++++-- rust-engine/src/models.rs | 2 ++ rust-engine/src/worker.rs | 2 ++ web-app/server.mjs | 56 +++++++++------------------------- 6 files changed, 66 insertions(+), 43 deletions(-) diff --git a/rust-engine/Cargo.lock b/rust-engine/Cargo.lock index b5c21c0..3da1251 100644 --- a/rust-engine/Cargo.lock +++ b/rust-engine/Cargo.lock @@ -1731,6 +1731,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "base64", "bytes", "chrono", "dotenvy", diff --git a/rust-engine/Cargo.toml b/rust-engine/Cargo.toml index ccbb0b2..816cab3 100644 --- a/rust-engine/Cargo.toml +++ b/rust-engine/Cargo.toml @@ -24,3 +24,4 @@ futures-util = "0.3" lazy_static = "1.4" bytes = "1.4" pdf-extract = "0.6" +base64 = "0.22" diff --git a/rust-engine/src/file_worker.rs b/rust-engine/src/file_worker.rs index 4316d67..339edc0 100644 --- a/rust-engine/src/file_worker.rs +++ b/rust-engine/src/file_worker.rs @@ -2,6 +2,7 @@ use crate::gemini_client::{demo_text_embedding, generate_text_with_model, DEMO_E use crate::vector; use crate::vector_db::QdrantClient; use anyhow::{anyhow, Context, Result}; +use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine as _}; use pdf_extract::extract_text; use sqlx::MySqlPool; use std::path::PathBuf; @@ -87,17 +88,37 @@ impl FileWorker { warn!(file_id, %filename, %path, "extracted excerpt is empty; prompts may lack context"); } + let (raw_base64, raw_truncated) = match read_file_base64(&path).await { + Ok(tuple) => tuple, + Err(err) => { + warn!(file_id, %filename, %path, error = ?err, "failed to read raw file bytes for prompt"); + (String::new(), false) + } + }; + let excerpt_note = if truncated { "(excerpt truncated for prompt size)" } else { "" }; + let raw_note = if raw_truncated { + "(base64 truncated to first 512KB)" + } else { + "(base64)" + }; + // Stage 1: Gemini 2.5 Flash for description - let desc_prompt = format!( + let mut desc_prompt = format!( "You are reviewing the PDF file '{filename}'. Use the following extracted text {excerpt_note} to produce a concise, factual description and key highlights that will help downstream search and reasoning.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---", file_excerpt ); + if !raw_base64.is_empty() { + desc_prompt.push_str(&format!( + "\n\n--- BEGIN RAW FILE {raw_note} ---\n{}\n--- END RAW FILE ---", + raw_base64 + )); + } let desc = generate_text_with_model("gemini-2.5-flash", &desc_prompt) .await .unwrap_or_else(|e| format!("[desc error: {}]", e)); @@ -110,10 +131,16 @@ impl FileWorker { .await?; // Stage 2: Gemini 2.5 Pro for deep vector graph data - let vector_prompt = format!( + let mut vector_prompt = format!( "You are constructing vector search metadata for the PDF file '{filename}'.\nCurrent description: {desc}\nUse the extracted text {excerpt_note} below to derive precise keywords, thematic clusters, and relationships that are explicitly supported by the content. Provide richly structured bullet points grouped by themes.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---", file_excerpt ); + if !raw_base64.is_empty() { + vector_prompt.push_str(&format!( + "\n\n--- BEGIN RAW FILE {raw_note} ---\n{}\n--- END RAW FILE ---", + raw_base64 + )); + } let vector_graph = generate_text_with_model("gemini-2.5-pro", &vector_prompt) .await .unwrap_or_else(|e| format!("[vector error: {}]", e)); @@ -158,6 +185,7 @@ impl FileWorker { // Maximum number of characters from the extracted text to include in prompts. const MAX_EXCERPT_CHARS: usize = 4000; +const MAX_RAW_BYTES: usize = 512 * 1024; // limit base64 payload fed into prompts async fn extract_file_excerpt(path: &str) -> Result<(String, bool)> { let path_buf = PathBuf::from(path); @@ -224,3 +252,18 @@ fn collapse_whitespace(input: &str) -> String { } output.trim().to_string() } + +async fn read_file_base64(path: &str) -> Result<(String, bool)> { + let bytes = tokio::fs::read(path).await?; + if bytes.is_empty() { + return Ok((String::new(), false)); + } + let truncated = bytes.len() > MAX_RAW_BYTES; + let slice = if truncated { + &bytes[..MAX_RAW_BYTES] + } else { + &bytes[..] + }; + let encoded = BASE64_STANDARD.encode(slice); + Ok((encoded, truncated)) +} diff --git a/rust-engine/src/models.rs b/rust-engine/src/models.rs index ecea22f..401e883 100644 --- a/rust-engine/src/models.rs +++ b/rust-engine/src/models.rs @@ -11,6 +11,7 @@ pub struct FileRecord { pub created_at: Option>, pub pending_analysis: bool, // true if file is not yet ready for search pub analysis_status: String, // 'Queued', 'InProgress', 'Completed', 'Failed' + pub raw_url: Option, } impl FileRecord { @@ -28,6 +29,7 @@ impl FileRecord { created_at: None, pending_analysis: true, analysis_status: "Queued".to_string(), + raw_url: None, } } } diff --git a/rust-engine/src/worker.rs b/rust-engine/src/worker.rs index 822f762..79b02d8 100644 --- a/rust-engine/src/worker.rs +++ b/rust-engine/src/worker.rs @@ -191,6 +191,7 @@ impl Worker { "filename": filename, "path": path, "storage_url": storage_url, + "raw_url": storage_url, "description": description, "analysis_status": analysis_status, "score": score @@ -223,6 +224,7 @@ impl Worker { "filename": filename, "path": path, "storage_url": storage_url, + "raw_url": storage_url, "description": description, "analysis_status": analysis_status, "score": serde_json::Value::Null diff --git a/web-app/server.mjs b/web-app/server.mjs index cb08374..e811db1 100644 --- a/web-app/server.mjs +++ b/web-app/server.mjs @@ -2,9 +2,7 @@ import express from 'express'; import path from 'node:path'; import helmet from 'helmet'; import cors from 'cors'; -import http from 'node:http'; -import https from 'node:https'; -import { URL } from 'node:url'; +import fetch from 'node-fetch'; import { fileURLToPath } from 'node:url'; const __filename = fileURLToPath(import.meta.url); @@ -17,58 +15,34 @@ const RUST_ENGINE_BASE = process.env.RUST_ENGINE_BASE || process.env.RUST_ENGINE_URL || 'http://rust-engine:8000'; -const STORAGE_DIR = path.resolve(process.env.ASTRA_STORAGE || '/app/storage'); app.set('trust proxy', true); app.use(helmet({ contentSecurityPolicy: false })); app.use(cors()); +app.use(express.json()); + app.get('/api/healthz', (_req, res) => { res.json({ status: 'ok', upstream: RUST_ENGINE_BASE }); }); -// Proxy all /api/* calls (including POST bodies, multipart uploads, etc.) -app.use('/api', (req, res) => { - const targetUrl = new URL(req.originalUrl, RUST_ENGINE_BASE); - const client = targetUrl.protocol === 'https:' ? https : http; - - const headers = { ...req.headers, host: targetUrl.host }; - - const proxyReq = client.request( - targetUrl, - { - method: req.method, - headers, - }, - (upstream) => { - res.status(upstream.statusCode || 502); - for (const [key, value] of Object.entries(upstream.headers)) { - if (typeof value !== 'undefined') { - res.setHeader(key, value); - } - } - upstream.pipe(res); - } - ); - - proxyReq.on('error', (err) => { - console.error('API proxy error:', err); - if (!res.headersSent) { - res.status(502).json({ error: 'proxy_failed' }); - } else { - res.end(); - } - }); - - req.pipe(proxyReq); +// Proxy minimal API needed by the UI to the rust-engine container +app.post('/api/files/import-demo', async (req, res) => { + try { + const qs = req.url.includes('?') ? req.url.substring(req.url.indexOf('?')) : ''; + const url = `${RUST_ENGINE_BASE}/api/files/import-demo${qs}`; + const upstream = await fetch(url, { method: 'POST', headers: { 'content-type': 'application/json' }, body: req.body ? JSON.stringify(req.body) : undefined }); + const text = await upstream.text(); + res.status(upstream.status).type(upstream.headers.get('content-type') || 'application/json').send(text); + } catch (err) { + console.error('import-demo proxy failed:', err); + res.status(502).json({ error: 'proxy_failed' }); + } }); // Serve static frontend const distDir = path.resolve(__dirname, 'dist'); app.use(express.static(distDir)); -// Expose imported files for the UI (read-only) -app.use('/storage', express.static(STORAGE_DIR)); - // SPA fallback (Express 5 requires middleware instead of bare '*') app.use((req, res) => { res.sendFile(path.join(distDir, 'index.html'));