Fix files
This commit is contained in:
parent
18081a28ba
commit
3c8d08dc03
6 changed files with 66 additions and 43 deletions
1
rust-engine/Cargo.lock
generated
1
rust-engine/Cargo.lock
generated
|
|
@ -1731,6 +1731,7 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
"base64",
|
||||||
"bytes",
|
"bytes",
|
||||||
"chrono",
|
"chrono",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
|
||||||
|
|
@ -24,3 +24,4 @@ futures-util = "0.3"
|
||||||
lazy_static = "1.4"
|
lazy_static = "1.4"
|
||||||
bytes = "1.4"
|
bytes = "1.4"
|
||||||
pdf-extract = "0.6"
|
pdf-extract = "0.6"
|
||||||
|
base64 = "0.22"
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ use crate::gemini_client::{demo_text_embedding, generate_text_with_model, DEMO_E
|
||||||
use crate::vector;
|
use crate::vector;
|
||||||
use crate::vector_db::QdrantClient;
|
use crate::vector_db::QdrantClient;
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine as _};
|
||||||
use pdf_extract::extract_text;
|
use pdf_extract::extract_text;
|
||||||
use sqlx::MySqlPool;
|
use sqlx::MySqlPool;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
@ -87,17 +88,37 @@ impl FileWorker {
|
||||||
warn!(file_id, %filename, %path, "extracted excerpt is empty; prompts may lack context");
|
warn!(file_id, %filename, %path, "extracted excerpt is empty; prompts may lack context");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let (raw_base64, raw_truncated) = match read_file_base64(&path).await {
|
||||||
|
Ok(tuple) => tuple,
|
||||||
|
Err(err) => {
|
||||||
|
warn!(file_id, %filename, %path, error = ?err, "failed to read raw file bytes for prompt");
|
||||||
|
(String::new(), false)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let excerpt_note = if truncated {
|
let excerpt_note = if truncated {
|
||||||
"(excerpt truncated for prompt size)"
|
"(excerpt truncated for prompt size)"
|
||||||
} else {
|
} else {
|
||||||
""
|
""
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let raw_note = if raw_truncated {
|
||||||
|
"(base64 truncated to first 512KB)"
|
||||||
|
} else {
|
||||||
|
"(base64)"
|
||||||
|
};
|
||||||
|
|
||||||
// Stage 1: Gemini 2.5 Flash for description
|
// Stage 1: Gemini 2.5 Flash for description
|
||||||
let desc_prompt = format!(
|
let mut desc_prompt = format!(
|
||||||
"You are reviewing the PDF file '{filename}'. Use the following extracted text {excerpt_note} to produce a concise, factual description and key highlights that will help downstream search and reasoning.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---",
|
"You are reviewing the PDF file '{filename}'. Use the following extracted text {excerpt_note} to produce a concise, factual description and key highlights that will help downstream search and reasoning.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---",
|
||||||
file_excerpt
|
file_excerpt
|
||||||
);
|
);
|
||||||
|
if !raw_base64.is_empty() {
|
||||||
|
desc_prompt.push_str(&format!(
|
||||||
|
"\n\n--- BEGIN RAW FILE {raw_note} ---\n{}\n--- END RAW FILE ---",
|
||||||
|
raw_base64
|
||||||
|
));
|
||||||
|
}
|
||||||
let desc = generate_text_with_model("gemini-2.5-flash", &desc_prompt)
|
let desc = generate_text_with_model("gemini-2.5-flash", &desc_prompt)
|
||||||
.await
|
.await
|
||||||
.unwrap_or_else(|e| format!("[desc error: {}]", e));
|
.unwrap_or_else(|e| format!("[desc error: {}]", e));
|
||||||
|
|
@ -110,10 +131,16 @@ impl FileWorker {
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// Stage 2: Gemini 2.5 Pro for deep vector graph data
|
// Stage 2: Gemini 2.5 Pro for deep vector graph data
|
||||||
let vector_prompt = format!(
|
let mut vector_prompt = format!(
|
||||||
"You are constructing vector search metadata for the PDF file '{filename}'.\nCurrent description: {desc}\nUse the extracted text {excerpt_note} below to derive precise keywords, thematic clusters, and relationships that are explicitly supported by the content. Provide richly structured bullet points grouped by themes.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---",
|
"You are constructing vector search metadata for the PDF file '{filename}'.\nCurrent description: {desc}\nUse the extracted text {excerpt_note} below to derive precise keywords, thematic clusters, and relationships that are explicitly supported by the content. Provide richly structured bullet points grouped by themes.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---",
|
||||||
file_excerpt
|
file_excerpt
|
||||||
);
|
);
|
||||||
|
if !raw_base64.is_empty() {
|
||||||
|
vector_prompt.push_str(&format!(
|
||||||
|
"\n\n--- BEGIN RAW FILE {raw_note} ---\n{}\n--- END RAW FILE ---",
|
||||||
|
raw_base64
|
||||||
|
));
|
||||||
|
}
|
||||||
let vector_graph = generate_text_with_model("gemini-2.5-pro", &vector_prompt)
|
let vector_graph = generate_text_with_model("gemini-2.5-pro", &vector_prompt)
|
||||||
.await
|
.await
|
||||||
.unwrap_or_else(|e| format!("[vector error: {}]", e));
|
.unwrap_or_else(|e| format!("[vector error: {}]", e));
|
||||||
|
|
@ -158,6 +185,7 @@ impl FileWorker {
|
||||||
|
|
||||||
// Maximum number of characters from the extracted text to include in prompts.
|
// Maximum number of characters from the extracted text to include in prompts.
|
||||||
const MAX_EXCERPT_CHARS: usize = 4000;
|
const MAX_EXCERPT_CHARS: usize = 4000;
|
||||||
|
const MAX_RAW_BYTES: usize = 512 * 1024; // limit base64 payload fed into prompts
|
||||||
|
|
||||||
async fn extract_file_excerpt(path: &str) -> Result<(String, bool)> {
|
async fn extract_file_excerpt(path: &str) -> Result<(String, bool)> {
|
||||||
let path_buf = PathBuf::from(path);
|
let path_buf = PathBuf::from(path);
|
||||||
|
|
@ -224,3 +252,18 @@ fn collapse_whitespace(input: &str) -> String {
|
||||||
}
|
}
|
||||||
output.trim().to_string()
|
output.trim().to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn read_file_base64(path: &str) -> Result<(String, bool)> {
|
||||||
|
let bytes = tokio::fs::read(path).await?;
|
||||||
|
if bytes.is_empty() {
|
||||||
|
return Ok((String::new(), false));
|
||||||
|
}
|
||||||
|
let truncated = bytes.len() > MAX_RAW_BYTES;
|
||||||
|
let slice = if truncated {
|
||||||
|
&bytes[..MAX_RAW_BYTES]
|
||||||
|
} else {
|
||||||
|
&bytes[..]
|
||||||
|
};
|
||||||
|
let encoded = BASE64_STANDARD.encode(slice);
|
||||||
|
Ok((encoded, truncated))
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ pub struct FileRecord {
|
||||||
pub created_at: Option<DateTime<Utc>>,
|
pub created_at: Option<DateTime<Utc>>,
|
||||||
pub pending_analysis: bool, // true if file is not yet ready for search
|
pub pending_analysis: bool, // true if file is not yet ready for search
|
||||||
pub analysis_status: String, // 'Queued', 'InProgress', 'Completed', 'Failed'
|
pub analysis_status: String, // 'Queued', 'InProgress', 'Completed', 'Failed'
|
||||||
|
pub raw_url: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FileRecord {
|
impl FileRecord {
|
||||||
|
|
@ -28,6 +29,7 @@ impl FileRecord {
|
||||||
created_at: None,
|
created_at: None,
|
||||||
pending_analysis: true,
|
pending_analysis: true,
|
||||||
analysis_status: "Queued".to_string(),
|
analysis_status: "Queued".to_string(),
|
||||||
|
raw_url: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -191,6 +191,7 @@ impl Worker {
|
||||||
"filename": filename,
|
"filename": filename,
|
||||||
"path": path,
|
"path": path,
|
||||||
"storage_url": storage_url,
|
"storage_url": storage_url,
|
||||||
|
"raw_url": storage_url,
|
||||||
"description": description,
|
"description": description,
|
||||||
"analysis_status": analysis_status,
|
"analysis_status": analysis_status,
|
||||||
"score": score
|
"score": score
|
||||||
|
|
@ -223,6 +224,7 @@ impl Worker {
|
||||||
"filename": filename,
|
"filename": filename,
|
||||||
"path": path,
|
"path": path,
|
||||||
"storage_url": storage_url,
|
"storage_url": storage_url,
|
||||||
|
"raw_url": storage_url,
|
||||||
"description": description,
|
"description": description,
|
||||||
"analysis_status": analysis_status,
|
"analysis_status": analysis_status,
|
||||||
"score": serde_json::Value::Null
|
"score": serde_json::Value::Null
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,7 @@ import express from 'express';
|
||||||
import path from 'node:path';
|
import path from 'node:path';
|
||||||
import helmet from 'helmet';
|
import helmet from 'helmet';
|
||||||
import cors from 'cors';
|
import cors from 'cors';
|
||||||
import http from 'node:http';
|
import fetch from 'node-fetch';
|
||||||
import https from 'node:https';
|
|
||||||
import { URL } from 'node:url';
|
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
|
@ -17,58 +15,34 @@ const RUST_ENGINE_BASE =
|
||||||
process.env.RUST_ENGINE_BASE ||
|
process.env.RUST_ENGINE_BASE ||
|
||||||
process.env.RUST_ENGINE_URL ||
|
process.env.RUST_ENGINE_URL ||
|
||||||
'http://rust-engine:8000';
|
'http://rust-engine:8000';
|
||||||
const STORAGE_DIR = path.resolve(process.env.ASTRA_STORAGE || '/app/storage');
|
|
||||||
|
|
||||||
app.set('trust proxy', true);
|
app.set('trust proxy', true);
|
||||||
app.use(helmet({ contentSecurityPolicy: false }));
|
app.use(helmet({ contentSecurityPolicy: false }));
|
||||||
app.use(cors());
|
app.use(cors());
|
||||||
|
app.use(express.json());
|
||||||
|
|
||||||
app.get('/api/healthz', (_req, res) => {
|
app.get('/api/healthz', (_req, res) => {
|
||||||
res.json({ status: 'ok', upstream: RUST_ENGINE_BASE });
|
res.json({ status: 'ok', upstream: RUST_ENGINE_BASE });
|
||||||
});
|
});
|
||||||
|
|
||||||
// Proxy all /api/* calls (including POST bodies, multipart uploads, etc.)
|
// Proxy minimal API needed by the UI to the rust-engine container
|
||||||
app.use('/api', (req, res) => {
|
app.post('/api/files/import-demo', async (req, res) => {
|
||||||
const targetUrl = new URL(req.originalUrl, RUST_ENGINE_BASE);
|
try {
|
||||||
const client = targetUrl.protocol === 'https:' ? https : http;
|
const qs = req.url.includes('?') ? req.url.substring(req.url.indexOf('?')) : '';
|
||||||
|
const url = `${RUST_ENGINE_BASE}/api/files/import-demo${qs}`;
|
||||||
const headers = { ...req.headers, host: targetUrl.host };
|
const upstream = await fetch(url, { method: 'POST', headers: { 'content-type': 'application/json' }, body: req.body ? JSON.stringify(req.body) : undefined });
|
||||||
|
const text = await upstream.text();
|
||||||
const proxyReq = client.request(
|
res.status(upstream.status).type(upstream.headers.get('content-type') || 'application/json').send(text);
|
||||||
targetUrl,
|
} catch (err) {
|
||||||
{
|
console.error('import-demo proxy failed:', err);
|
||||||
method: req.method,
|
res.status(502).json({ error: 'proxy_failed' });
|
||||||
headers,
|
}
|
||||||
},
|
|
||||||
(upstream) => {
|
|
||||||
res.status(upstream.statusCode || 502);
|
|
||||||
for (const [key, value] of Object.entries(upstream.headers)) {
|
|
||||||
if (typeof value !== 'undefined') {
|
|
||||||
res.setHeader(key, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
upstream.pipe(res);
|
|
||||||
}
|
|
||||||
);
|
|
||||||
|
|
||||||
proxyReq.on('error', (err) => {
|
|
||||||
console.error('API proxy error:', err);
|
|
||||||
if (!res.headersSent) {
|
|
||||||
res.status(502).json({ error: 'proxy_failed' });
|
|
||||||
} else {
|
|
||||||
res.end();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
req.pipe(proxyReq);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Serve static frontend
|
// Serve static frontend
|
||||||
const distDir = path.resolve(__dirname, 'dist');
|
const distDir = path.resolve(__dirname, 'dist');
|
||||||
app.use(express.static(distDir));
|
app.use(express.static(distDir));
|
||||||
|
|
||||||
// Expose imported files for the UI (read-only)
|
|
||||||
app.use('/storage', express.static(STORAGE_DIR));
|
|
||||||
|
|
||||||
// SPA fallback (Express 5 requires middleware instead of bare '*')
|
// SPA fallback (Express 5 requires middleware instead of bare '*')
|
||||||
app.use((req, res) => {
|
app.use((req, res) => {
|
||||||
res.sendFile(path.join(distDir, 'index.html'));
|
res.sendFile(path.join(distDir, 'index.html'));
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue