CodeRED-Astra/rust-engine/src/file_worker.rs
2025-10-19 11:59:13 -05:00

269 lines
9.5 KiB
Rust

use crate::gemini_client::{demo_text_embedding, generate_text_with_model, DEMO_EMBED_DIM};
use crate::vector;
use crate::vector_db::QdrantClient;
use anyhow::{anyhow, Context, Result};
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine as _};
use pdf_extract::extract_text;
use sqlx::MySqlPool;
use std::path::PathBuf;
use tracing::{error, info, warn};
pub struct FileWorker {
pool: MySqlPool,
qdrant: QdrantClient,
}
impl FileWorker {
pub fn new(pool: MySqlPool) -> Self {
let qdrant_url =
std::env::var("QDRANT_URL").unwrap_or_else(|_| "http://qdrant:6333".to_string());
let qdrant = QdrantClient::new(&qdrant_url);
Self { pool, qdrant }
}
pub async fn run(&self) {
info!("FileWorker starting");
if let Err(e) = self.qdrant.ensure_files_collection(DEMO_EMBED_DIM).await {
error!("Failed to ensure Qdrant collection: {}", e);
}
loop {
match self.fetch_and_claim().await {
Ok(Some(fid)) => {
info!("Processing file {}", fid);
if let Err(e) = self.process_file(&fid).await {
error!("Error processing file {}: {}", fid, e);
if let Err(mark_err) = self.mark_failed(&fid, &format!("{}", e)).await {
error!("Failed to mark file {} as failed: {}", fid, mark_err);
}
}
}
Ok(None) => {
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}
Err(e) => {
error!("FileWorker fetch error: {}", e);
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
}
}
}
}
async fn fetch_and_claim(&self) -> Result<Option<String>> {
// Claim files that are queued or stuck in progress for >10min
if let Some(row) = sqlx::query(
"SELECT id FROM files WHERE (analysis_status = 'Queued' OR (analysis_status = 'InProgress' AND created_at < (NOW() - INTERVAL 10 MINUTE))) AND pending_analysis = TRUE LIMIT 1"
)
.fetch_optional(&self.pool)
.await? {
use sqlx::Row;
let id: String = row.get("id");
// Mark as in-progress
let _ = sqlx::query("UPDATE files SET analysis_status = 'InProgress' WHERE id = ?")
.bind(&id)
.execute(&self.pool)
.await?;
Ok(Some(id))
} else {
Ok(None)
}
}
async fn process_file(&self, file_id: &str) -> Result<()> {
use sqlx::Row;
let row = sqlx::query("SELECT filename, path FROM files WHERE id = ?")
.bind(file_id)
.fetch_one(&self.pool)
.await?;
let filename: String = row.get("filename");
let path: String = row.get("path");
let (file_excerpt, truncated) = match extract_file_excerpt(&path).await {
Ok(res) => res,
Err(err) => {
error!(file_id, %filename, %path, error = ?err, "failed to extract text from file; continuing with filename only");
(String::new(), false)
}
};
if file_excerpt.is_empty() {
warn!(file_id, %filename, %path, "extracted excerpt is empty; prompts may lack context");
}
let (raw_base64, raw_truncated) = match read_file_base64(&path).await {
Ok(tuple) => tuple,
Err(err) => {
warn!(file_id, %filename, %path, error = ?err, "failed to read raw file bytes for prompt");
(String::new(), false)
}
};
let excerpt_note = if truncated {
"(excerpt truncated for prompt size)"
} else {
""
};
let raw_note = if raw_truncated {
"(base64 truncated to first 512KB)"
} else {
"(base64)"
};
// Stage 1: Gemini 2.5 Flash for description
let mut desc_prompt = format!(
"You are reviewing the PDF file '{filename}'. Use the following extracted text {excerpt_note} to produce a concise, factual description and key highlights that will help downstream search and reasoning.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---",
file_excerpt
);
if !raw_base64.is_empty() {
desc_prompt.push_str(&format!(
"\n\n--- BEGIN RAW FILE {raw_note} ---\n{}\n--- END RAW FILE ---",
raw_base64
));
}
let desc = generate_text_with_model("gemini-2.5-flash", &desc_prompt)
.await
.unwrap_or_else(|e| format!("[desc error: {}]", e));
sqlx::query(
"UPDATE files SET description = ?, analysis_status = 'InProgress' WHERE id = ?",
)
.bind(&desc)
.bind(file_id)
.execute(&self.pool)
.await?;
// Stage 2: Gemini 2.5 Pro for deep vector graph data
let mut vector_prompt = format!(
"You are constructing vector search metadata for the PDF file '{filename}'.\nCurrent description: {desc}\nUse the extracted text {excerpt_note} below to derive precise keywords, thematic clusters, and relationships that are explicitly supported by the content. Provide richly structured bullet points grouped by themes.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---",
file_excerpt
);
if !raw_base64.is_empty() {
vector_prompt.push_str(&format!(
"\n\n--- BEGIN RAW FILE {raw_note} ---\n{}\n--- END RAW FILE ---",
raw_base64
));
}
let vector_graph = generate_text_with_model("gemini-2.5-pro", &vector_prompt)
.await
.unwrap_or_else(|e| format!("[vector error: {}]", e));
// Stage 3: Embed and upsert to Qdrant
let emb = demo_text_embedding(&vector_graph).await?;
match self.qdrant.upsert_point(file_id, emb.clone()).await {
Ok(_) => {
let _ = vector::store_embedding(file_id, emb.clone());
}
Err(err) => {
error!("Qdrant upsert failed for {}: {}", file_id, err);
let _ = vector::store_embedding(file_id, emb);
}
}
// Mark file as ready
sqlx::query(
"UPDATE files SET pending_analysis = FALSE, analysis_status = 'Completed' WHERE id = ?",
)
.bind(file_id)
.execute(&self.pool)
.await?;
Ok(())
}
async fn mark_failed(&self, file_id: &str, reason: &str) -> Result<()> {
sqlx::query(
"UPDATE files SET analysis_status = 'Failed', pending_analysis = TRUE WHERE id = ?",
)
.bind(file_id)
.execute(&self.pool)
.await?;
sqlx::query("UPDATE files SET description = COALESCE(description, ?) WHERE id = ?")
.bind(format!("[analysis failed: {}]", reason))
.bind(file_id)
.execute(&self.pool)
.await?;
Ok(())
}
}
// Maximum number of characters from the extracted text to include in prompts.
const MAX_EXCERPT_CHARS: usize = 4000;
const MAX_RAW_BYTES: usize = 512 * 1024; // limit base64 payload fed into prompts
async fn extract_file_excerpt(path: &str) -> Result<(String, bool)> {
let path_buf = PathBuf::from(path);
let extension = path_buf
.extension()
.and_then(|e| e.to_str())
.map(|s| s.to_ascii_lowercase())
.unwrap_or_default();
let raw_text = if extension == "pdf" {
let pdf_path = path_buf.clone();
tokio::task::spawn_blocking(move || extract_text(&pdf_path))
.await
.map_err(|e| anyhow!("pdf text extraction task panicked: {e}"))??
} else {
let bytes = tokio::fs::read(&path_buf)
.await
.with_context(|| format!("reading file bytes from {path}"))?;
String::from_utf8_lossy(&bytes).into_owned()
};
let cleaned = raw_text.replace('\r', "");
let condensed = collapse_whitespace(&cleaned);
let (excerpt, truncated) = truncate_to_chars(&condensed, MAX_EXCERPT_CHARS);
Ok((excerpt, truncated))
}
fn truncate_to_chars(text: &str, max_chars: usize) -> (String, bool) {
if max_chars == 0 {
return (String::new(), !text.is_empty());
}
let mut result = String::new();
let mut chars = text.chars();
for _ in 0..max_chars {
match chars.next() {
Some(ch) => result.push(ch),
None => return (result, false),
}
}
if chars.next().is_some() {
result.push('…');
(result, true)
} else {
(result, false)
}
}
fn collapse_whitespace(input: &str) -> String {
let mut output = String::with_capacity(input.len());
let mut prev_was_ws = false;
for ch in input.chars() {
if ch.is_whitespace() {
if !prev_was_ws {
output.push(' ');
}
prev_was_ws = true;
} else {
prev_was_ws = false;
output.push(ch);
}
}
output.trim().to_string()
}
async fn read_file_base64(path: &str) -> Result<(String, bool)> {
let bytes = tokio::fs::read(path).await?;
if bytes.is_empty() {
return Ok((String::new(), false));
}
let truncated = bytes.len() > MAX_RAW_BYTES;
let slice = if truncated {
&bytes[..MAX_RAW_BYTES]
} else {
&bytes[..]
};
let encoded = BASE64_STANDARD.encode(slice);
Ok((encoded, truncated))
}