diff --git a/rust-engine/Dockerfile b/rust-engine/Dockerfile index 56e962d..3bdf30a 100644 --- a/rust-engine/Dockerfile +++ b/rust-engine/Dockerfile @@ -2,11 +2,10 @@ # rust-engine/Dockerfile # --- Stage 1: Builder --- -# Use a stable Rust version +# (No changes needed in this stage) FROM rust:slim AS builder WORKDIR /usr/src/app -# Install build dependencies needed for sqlx RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ libssl-dev \ @@ -15,17 +14,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ && rm -rf /var/lib/apt/lists/* - -# Allow optional override of toolchain (e.g., nightly or a pinned version). Leave empty to use image default. ARG RUSTUP_TOOLCHAIN= - -# Use rustup and cargo from the official Rust image location ENV PATH="/usr/local/cargo/bin:${PATH}" -# Copy manifest files first to leverage Docker layer caching for dependencies COPY Cargo.toml Cargo.lock rust-toolchain.toml ./ -# Ensure the pinned toolchain from rust-toolchain.toml (or provided ARG) is installed only if missing RUN set -eux; \ if [ -n "${RUSTUP_TOOLCHAIN}" ]; then \ if ! rustup toolchain list | grep -q "^${RUSTUP_TOOLCHAIN}"; then \ @@ -45,46 +38,38 @@ RUN set -eux; \ fi; \ rustup show active-toolchain || true -# Create a dummy src to allow cargo to download dependencies into the cache layer RUN mkdir -p src && echo "fn main() { println!(\"cargo cache build\"); }" > src/main.rs -# Warm up dependency caches without compiling a dummy binary RUN --mount=type=cache,target=/usr/local/cargo/registry,sharing=locked \ --mount=type=cache,target=/usr/local/cargo/git,sharing=locked \ cargo fetch - -# Remove dummy main.rs before copying the real source RUN rm -f src/main.rs COPY src ./src -# Build the real binary RUN cargo build --release --locked # --- Stage 2: Final, small image --- FROM debian:bookworm-slim -# Install only necessary runtime dependencies (no upgrade, just ca-certificates) RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/* -# Add a non-root user for security RUN useradd --system --uid 10001 --no-create-home --shell /usr/sbin/nologin appuser # Copy the compiled binary from the builder stage - COPY --from=builder /usr/src/app/target/release/rust-engine /usr/local/bin/rust-engine -# Create writable storage and logs directories for appuser +# --- THIS IS THE FIX --- +# **1. Copy the demo data files from your local machine into the image.** +COPY demo-data /app/demo-data + +# **2. Create other directories and set permissions on everything.** RUN chown appuser:appuser /usr/local/bin/rust-engine \ - && mkdir -p /var/log /app/storage /app/demo-data \ + && mkdir -p /var/log /app/storage \ && touch /var/log/astra-errors.log \ && chown -R appuser:appuser /var/log /app -# Set working directory to a writable location WORKDIR /app - -# Switch to non-root user USER appuser EXPOSE 8000 -# Redirect all output to /var/log/astra-errors.log for easy monitoring ENTRYPOINT ["/bin/sh", "-c", "/usr/local/bin/rust-engine >> /var/log/astra-errors.log 2>&1"] \ No newline at end of file diff --git a/rust-engine/README.md b/rust-engine/README.md index 037ee2a..35e0edc 100644 --- a/rust-engine/README.md +++ b/rust-engine/README.md @@ -13,6 +13,7 @@ - GEMINI_API_KEY: used for Gemini content generation (optional in demo) - DEMO_DATA_DIR: path to the folder containing PDF demo data (default resolves to `demo-data` under the repo or `/app/demo-data` in containers) - ASTRA_STORAGE: directory for uploaded file blobs (default `/app/storage`) +- AUTO_IMPORT_DEMO: set to `false`, `0`, `off`, or `no` to disable automatic demo import at startup (defaults to `true`) ## Endpoints (JSON) @@ -74,7 +75,7 @@ 2. set env DATABASE_URL and QDRANT_URL 3. cargo run 4. (optional) import demo PDFs - - Populate a folder with PDFs under `rust-engine/demo-data` (or point `DEMO_DATA_DIR` to a custom path). The server auto-resolves common locations such as the repo root, `/app/demo-data`, and the working directory when running in Docker. + - Populate a folder with PDFs under `rust-engine/demo-data` (or point `DEMO_DATA_DIR` to a custom path). The server auto-resolves common locations such as the repo root, `/app/demo-data`, and the working directory when running in Docker. When the engine boots it automatically attempts this import (can be disabled by setting `AUTO_IMPORT_DEMO=false`). - Call the endpoint: - POST - Optional query `?force=1` to overwrite existing by filename. The JSON response also echoes where the engine looked (`source_dir`, `attempted_paths`) and how many PDFs were detected (`files_found`) so misconfigurations are easy to spot. Imported files are written to the shared `/app/storage` volume; the web-app container mounts this volume read-only and serves the contents at `/storage/`. diff --git a/rust-engine/src/api.rs b/rust-engine/src/api.rs index 2950125..8e8c707 100644 --- a/rust-engine/src/api.rs +++ b/rust-engine/src/api.rs @@ -3,9 +3,9 @@ use crate::vector_db::QdrantClient; use anyhow::Result; use bytes::Buf; use futures_util::TryStreamExt; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use sqlx::{MySqlPool, Row}; -use tracing::info; +use tracing::{debug, info, warn}; use warp::{multipart::FormData, Filter, Rejection, Reply}; #[derive(Debug, Deserialize)] @@ -13,6 +13,177 @@ struct DeleteQuery { id: String, } +#[derive(Debug, Serialize)] +pub struct DemoImportSummary { + pub imported: usize, + pub skipped: usize, + pub files_found: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_dir: Option, + pub attempted_paths: Vec, + pub force: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +pub async fn perform_demo_import(force: bool, pool: &MySqlPool) -> Result { + use anyhow::Context; + use std::fs; + use std::path::PathBuf; + + let demo_dir_setting = + std::env::var("DEMO_DATA_DIR").unwrap_or_else(|_| "demo-data".to_string()); + info!(force, requested_dir = %demo_dir_setting, "demo import requested"); + + let base = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); + + // Build a list of plausible demo-data locations so local runs and containers both work. + let mut candidates: Vec = Vec::new(); + let configured = PathBuf::from(&demo_dir_setting); + let mut push_candidate = |path: PathBuf| { + if !candidates.iter().any(|existing| existing == &path) { + candidates.push(path); + } + }; + + push_candidate(base.join(&configured)); + push_candidate(PathBuf::from(&demo_dir_setting)); + push_candidate(base.join("rust-engine").join(&configured)); + push_candidate(base.join("rust-engine").join("demo-data")); + push_candidate(base.join("demo-data")); + if let Ok(exe_path) = std::env::current_exe() { + if let Some(exe_dir) = exe_path.parent() { + push_candidate(exe_dir.join(&configured)); + push_candidate(exe_dir.join("demo-data")); + push_candidate(exe_dir.join("rust-engine").join(&configured)); + } + } + + let mut attempted: Vec = Vec::new(); + let mut resolved_dir: Option = None; + for candidate in candidates { + debug!(candidate = %candidate.display(), "evaluating demo import path candidate"); + if candidate.exists() && candidate.is_dir() { + resolved_dir = Some(candidate.clone()); + break; + } + attempted.push(candidate); + } + + let attempted_paths: Vec = attempted.iter().map(|p| p.display().to_string()).collect(); + let mut summary = DemoImportSummary { + imported: 0, + skipped: 0, + files_found: 0, + source_dir: resolved_dir.as_ref().map(|p| p.display().to_string()), + attempted_paths, + force, + error: None, + }; + + let src_dir = match resolved_dir { + Some(path) => path, + None => { + summary.error = + Some("demo dir not found; set DEMO_DATA_DIR or bind mount demo PDFs".to_string()); + warn!( + attempted_paths = ?summary.attempted_paths, + "demo import skipped; source directory not found" + ); + return Ok(summary); + } + }; + + summary.source_dir = Some(src_dir.display().to_string()); + info!( + source = %summary.source_dir.as_deref().unwrap_or_default(), + attempted_paths = ?summary.attempted_paths, + "demo import source resolved" + ); + + for entry in fs::read_dir(&src_dir).with_context(|| format!("reading {:?}", src_dir))? { + let entry = entry.with_context(|| format!("reading entry in {:?}", src_dir))?; + let path = entry.path(); + if path + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.eq_ignore_ascii_case("pdf")) + .unwrap_or(false) + { + summary.files_found += 1; + let filename = path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown.pdf") + .to_string(); + + if !force { + if let Some(_) = sqlx::query("SELECT id FROM files WHERE filename = ?") + .bind(&filename) + .fetch_optional(pool) + .await? + { + summary.skipped += 1; + info!(%filename, "skipping demo import; already present"); + continue; + } + } + + let data = fs::read(&path).with_context(|| format!("reading {:?}", path))?; + let stored_path = storage::save_file(&filename, &data)?; + info!(%filename, dest = %stored_path.to_string_lossy(), "demo file copied to storage"); + + let id = uuid::Uuid::new_v4().to_string(); + + if force { + sqlx::query("DELETE FROM files WHERE filename = ?") + .bind(&filename) + .execute(pool) + .await?; + info!(%filename, "existing file records removed due to force import"); + } + + sqlx::query("INSERT INTO files (id, filename, path, description, pending_analysis, analysis_status) VALUES (?, ?, ?, ?, ?, 'Queued')") + .bind(&id) + .bind(&filename) + .bind(stored_path.to_string_lossy().to_string()) + .bind(Option::::None) + .bind(true) + .execute(pool) + .await?; + info!(%filename, file_id = %id, "demo file inserted into database"); + + sqlx::query("INSERT INTO file_jobs (file_id, status) VALUES (?, 'Queued')") + .bind(&id) + .execute(pool) + .await?; + info!(%filename, file_id = %id, "demo file queued for analysis"); + + summary.imported += 1; + } else { + debug!(path = %path.display(), "skipping non-PDF entry during demo import"); + } + } + + let source_label = summary.source_dir.as_deref().unwrap_or("unknown"); + + if summary.files_found == 0 { + warn!(source = %source_label, "demo import located zero PDFs"); + } + + info!( + source = %source_label, + files_found = summary.files_found, + attempted_paths = ?summary.attempted_paths, + imported = summary.imported, + skipped = summary.skipped, + force, + "demo import completed" + ); + + Ok(summary) +} + pub fn routes(pool: MySqlPool) -> impl Filter + Clone { let pool_filter = warp::any().map(move || pool.clone()); @@ -150,154 +321,27 @@ async fn handle_import_demo( params: std::collections::HashMap, pool: MySqlPool, ) -> Result { - use std::fs; - use std::path::PathBuf; let force = params .get("force") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false); - let demo_dir_setting = - std::env::var("DEMO_DATA_DIR").unwrap_or_else(|_| "demo-data".to_string()); - let base = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); - info!(force, requested_dir = %demo_dir_setting, "demo import requested"); - - // Build a list of plausible demo-data locations so local runs and containers both work. - let mut candidates: Vec = Vec::new(); - let configured = PathBuf::from(&demo_dir_setting); - let mut push_candidate = |path: PathBuf| { - if !candidates.iter().any(|existing| existing == &path) { - candidates.push(path); - } - }; - - push_candidate(base.join(&configured)); - push_candidate(PathBuf::from(&demo_dir_setting)); - push_candidate(base.join("rust-engine").join(&configured)); - push_candidate(base.join("rust-engine").join("demo-data")); - push_candidate(base.join("demo-data")); - if let Ok(exe_path) = std::env::current_exe() { - if let Some(exe_dir) = exe_path.parent() { - push_candidate(exe_dir.join(&configured)); - push_candidate(exe_dir.join("demo-data")); - push_candidate(exe_dir.join("rust-engine").join(&configured)); + match perform_demo_import(force, &pool).await { + Ok(summary) => Ok(warp::reply::json(&summary)), + Err(err) => { + tracing::error!(error = %err, "demo import failed"); + let fallback = DemoImportSummary { + imported: 0, + skipped: 0, + files_found: 0, + source_dir: None, + attempted_paths: Vec::new(), + force, + error: Some(err.to_string()), + }; + Ok(warp::reply::json(&fallback)) } } - - let mut attempted: Vec = Vec::new(); - let mut resolved_dir: Option = None; - for candidate in candidates { - if candidate.exists() && candidate.is_dir() { - resolved_dir = Some(candidate.clone()); - break; - } - attempted.push(candidate); - } - - let attempted_paths: Vec = attempted.iter().map(|p| p.display().to_string()).collect(); - let src_dir = match resolved_dir { - Some(path) => path, - None => { - return Ok(warp::reply::json(&serde_json::json!({ - "imported": 0, - "skipped": 0, - "files_found": 0, - "attempted_paths": attempted_paths, - "error": format!("demo dir not found; set DEMO_DATA_DIR or bind mount demo PDFs") - }))); - } - }; - let src_dir_display = src_dir.display().to_string(); - info!(source = %src_dir_display, attempted_paths = ?attempted_paths, "demo import source resolved"); - let mut imported = 0; - let mut skipped = 0; - let mut files_found = 0; - for entry in fs::read_dir(&src_dir).map_err(|_| warp::reject())? { - let entry = entry.map_err(|_| warp::reject())?; - let path = entry.path(); - if path - .extension() - .and_then(|e| e.to_str()) - .map(|e| e.eq_ignore_ascii_case("pdf")) - .unwrap_or(false) - { - files_found += 1; - let filename = path - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or("unknown.pdf") - .to_string(); - - // check if exists - if !force { - if let Some(_) = sqlx::query("SELECT id FROM files WHERE filename = ?") - .bind(&filename) - .fetch_optional(&pool) - .await - .map_err(|_| warp::reject())? - { - skipped += 1; - info!(%filename, "skipping demo import; already present"); - continue; - } - } - - // read and save to storage - let data = fs::read(&path).map_err(|_| warp::reject())?; - let stored_path = storage::save_file(&filename, &data).map_err(|_| warp::reject())?; - info!(%filename, dest = %stored_path.to_string_lossy(), "demo file copied to storage"); - - // insert or upsert db record - let id = uuid::Uuid::new_v4().to_string(); - if force { - let _ = sqlx::query("DELETE FROM files WHERE filename = ?") - .bind(&filename) - .execute(&pool) - .await; - info!(%filename, "existing file records removed due to force import"); - } - sqlx::query("INSERT INTO files (id, filename, path, description, pending_analysis, analysis_status) VALUES (?, ?, ?, ?, ?, 'Queued')") - .bind(&id) - .bind(&filename) - .bind(stored_path.to_string_lossy().to_string()) - .bind(Option::::None) - .bind(true) - .execute(&pool) - .await - .map_err(|e| { - tracing::error!("DB insert error: {}", e); - warp::reject() - })?; - info!(%filename, file_id = %id, "demo file inserted into database"); - - // queue for worker - sqlx::query("INSERT INTO file_jobs (file_id, status) VALUES (?, 'Queued')") - .bind(&id) - .execute(&pool) - .await - .map_err(|_| warp::reject())?; - info!(%filename, file_id = %id, "demo file queued for analysis"); - - imported += 1; - } - } - info!( - source = %src_dir_display, - files_found, - attempted_paths = ?attempted_paths, - imported, - skipped, - force, - "demo import completed" - ); - Ok(warp::reply::json(&serde_json::json!({ - "imported": imported, - "skipped": skipped, - "files_found": files_found, - "source_dir": src_dir_display, - "attempted_paths": attempted_paths, - "force": force - }))) } async fn handle_delete(q: DeleteQuery, pool: MySqlPool) -> Result { diff --git a/rust-engine/src/main.rs b/rust-engine/src/main.rs index 98f89c2..006e4ce 100644 --- a/rust-engine/src/main.rs +++ b/rust-engine/src/main.rs @@ -10,7 +10,7 @@ mod worker; use std::env; use std::error::Error; -use tracing::info; +use tracing::{error, info, warn}; use warp::Filter; #[tokio::main] @@ -34,6 +34,33 @@ async fn main() -> Result<(), Box> { .await .map_err(|e| -> Box { Box::new(e) })?; + let auto_import_setting = env::var("AUTO_IMPORT_DEMO").unwrap_or_else(|_| "true".to_string()); + let auto_import = !matches!( + auto_import_setting.trim().to_ascii_lowercase().as_str(), + "0" | "false" | "off" | "no" + ); + if auto_import { + match api::perform_demo_import(false, &pool).await { + Ok(summary) => { + if let Some(err_msg) = summary.error.as_ref() { + warn!(error = %err_msg, "startup demo import completed with warnings"); + } + info!( + imported = summary.imported, + skipped = summary.skipped, + files_found = summary.files_found, + source = summary.source_dir.as_deref().unwrap_or("unknown"), + "startup demo import completed" + ); + } + Err(err) => { + error!(error = %err, "startup demo import failed"); + } + } + } else { + info!("AUTO_IMPORT_DEMO disabled; skipping startup demo import"); + } + // Spawn query worker let worker = worker::Worker::new(pool.clone()); tokio::spawn(async move { worker.run().await });