diff --git a/rust-engine/Cargo.lock b/rust-engine/Cargo.lock index 57988d6..b5c21c0 100644 --- a/rust-engine/Cargo.lock +++ b/rust-engine/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "adobe-cmap-parser" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d3da9d617508ab8102c22f05bd772fc225ecb4fde431e38a45284e5c129a4bc" +dependencies = [ + "pom 1.1.0", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -31,7 +46,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -55,6 +70,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "base-x" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" + [[package]] name = "base64" version = "0.22.1" @@ -85,6 +106,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.19.0" @@ -154,6 +186,12 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "core-foundation" version = "0.9.4" @@ -194,6 +232,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-queue" version = "0.3.12" @@ -242,6 +289,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "discard" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0" + [[package]] name = "displaydoc" version = "0.2.5" @@ -250,7 +303,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -268,6 +321,70 @@ dependencies = [ "serde", ] +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -304,6 +421,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + [[package]] name = "event-listener" version = "5.4.1" @@ -327,6 +453,16 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" +[[package]] +name = "flate2" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flume" version = "0.11.1" @@ -426,7 +562,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -552,7 +688,7 @@ dependencies = [ "http", "httpdate", "mime", - "sha1", + "sha1 0.10.6", ] [[package]] @@ -946,6 +1082,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linked-hash-map" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a" + [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -973,6 +1115,22 @@ version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +[[package]] +name = "lopdf" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de0f69c40d6dbc68ebac4bf5aec3d9978e094e22e29fcabd045acd9cec74a9dc" +dependencies = [ + "encoding", + "flate2", + "itoa", + "linked-hash-map", + "log", + "pom 3.4.0", + "time", + "weezl", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -1011,6 +1169,16 @@ dependencies = [ "unicase", ] +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "mio" version = "1.1.0" @@ -1141,7 +1309,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -1191,6 +1359,22 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "pdf-extract" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f21fc45e1b40af7e6c7ca32af35464c1ea7a92e5d2e1465d08c8389e033240" +dependencies = [ + "adobe-cmap-parser", + "encoding", + "euclid", + "linked-hash-map", + "lopdf", + "postscript", + "type1-encoding-parser", + "unicode-normalization", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -1223,7 +1407,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -1265,6 +1449,27 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + +[[package]] +name = "pom" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c972d8f86e943ad532d0b04e8965a749ad1d18bb981a9c7b3ae72fe7fd7744b" +dependencies = [ + "bstr", +] + +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + [[package]] name = "potential_utf" version = "0.1.3" @@ -1283,6 +1488,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + [[package]] name = "proc-macro2" version = "1.0.101" @@ -1430,6 +1641,12 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" + [[package]] name = "reqwest" version = "0.12.24" @@ -1519,6 +1736,7 @@ dependencies = [ "dotenvy", "futures-util", "lazy_static", + "pdf-extract", "reqwest", "serde", "serde_json", @@ -1537,6 +1755,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.2" @@ -1641,6 +1868,21 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.228" @@ -1668,7 +1910,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -1696,6 +1938,15 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da05c97445caa12d05e848c4a4fcbbea29e748ac28f7e80e9b010392063770" +dependencies = [ + "sha1_smol", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1707,6 +1958,12 @@ dependencies = [ "digest", ] +[[package]] +name = "sha1_smol" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + [[package]] name = "sha2" version = "0.10.9" @@ -1752,6 +2009,12 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "slab" version = "0.4.11" @@ -1857,7 +2120,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn", + "syn 2.0.107", ] [[package]] @@ -1880,7 +2143,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn", + "syn 2.0.107", "tokio", "url", ] @@ -1918,7 +2181,7 @@ dependencies = [ "rand 0.8.5", "rsa", "serde", - "sha1", + "sha1 0.10.6", "sha2", "smallvec", "sqlx-core", @@ -2000,6 +2263,64 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "standback" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff" +dependencies = [ + "version_check", +] + +[[package]] +name = "stdweb" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5" +dependencies = [ + "discard", + "rustc_version", + "stdweb-derive", + "stdweb-internal-macros", + "stdweb-internal-runtime", + "wasm-bindgen", +] + +[[package]] +name = "stdweb-derive" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "serde_derive", + "syn 1.0.109", +] + +[[package]] +name = "stdweb-internal-macros" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11" +dependencies = [ + "base-x", + "proc-macro2", + "quote", + "serde", + "serde_derive", + "serde_json", + "sha1 0.6.1", + "syn 1.0.109", +] + +[[package]] +name = "stdweb-internal-runtime" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" + [[package]] name = "stringprep" version = "0.1.5" @@ -2017,6 +2338,17 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.107" @@ -2045,7 +2377,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2099,7 +2431,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2111,6 +2443,44 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "time" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242" +dependencies = [ + "const_fn", + "libc", + "standback", + "stdweb", + "time-macros", + "version_check", + "winapi", +] + +[[package]] +name = "time-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1" +dependencies = [ + "proc-macro-hack", + "time-macros-impl", +] + +[[package]] +name = "time-macros-impl" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "standback", + "syn 1.0.109", +] + [[package]] name = "tinystr" version = "0.8.1" @@ -2161,7 +2531,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2273,7 +2643,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2317,6 +2687,15 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "type1-encoding-parser" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b" +dependencies = [ + "pom 1.1.0", +] + [[package]] name = "typenum" version = "1.19.0" @@ -2493,7 +2872,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn", + "syn 2.0.107", "wasm-bindgen-shared", ] @@ -2528,7 +2907,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2580,6 +2959,12 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "weezl" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3" + [[package]] name = "whoami" version = "1.6.1" @@ -2590,6 +2975,28 @@ dependencies = [ "wasite", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -2611,7 +3018,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2622,7 +3029,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2947,7 +3354,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", "synstructure", ] @@ -2968,7 +3375,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] [[package]] @@ -2988,7 +3395,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", "synstructure", ] @@ -3028,5 +3435,5 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.107", ] diff --git a/rust-engine/Cargo.toml b/rust-engine/Cargo.toml index 8613870..ccbb0b2 100644 --- a/rust-engine/Cargo.toml +++ b/rust-engine/Cargo.toml @@ -23,3 +23,4 @@ tokio-util = "0.7" futures-util = "0.3" lazy_static = "1.4" bytes = "1.4" +pdf-extract = "0.6" diff --git a/rust-engine/src/file_worker.rs b/rust-engine/src/file_worker.rs index c9ec244..4316d67 100644 --- a/rust-engine/src/file_worker.rs +++ b/rust-engine/src/file_worker.rs @@ -1,9 +1,11 @@ use crate::gemini_client::{demo_text_embedding, generate_text_with_model, DEMO_EMBED_DIM}; use crate::vector; use crate::vector_db::QdrantClient; -use anyhow::Result; +use anyhow::{anyhow, Context, Result}; +use pdf_extract::extract_text; use sqlx::MySqlPool; -use tracing::{error, info}; +use std::path::PathBuf; +use tracing::{error, info, warn}; pub struct FileWorker { pool: MySqlPool, @@ -72,17 +74,33 @@ impl FileWorker { .fetch_one(&self.pool) .await?; let filename: String = row.get("filename"); - let _path: String = row.get("path"); + let path: String = row.get("path"); + + let (file_excerpt, truncated) = match extract_file_excerpt(&path).await { + Ok(res) => res, + Err(err) => { + error!(file_id, %filename, %path, error = ?err, "failed to extract text from file; continuing with filename only"); + (String::new(), false) + } + }; + if file_excerpt.is_empty() { + warn!(file_id, %filename, %path, "extracted excerpt is empty; prompts may lack context"); + } + + let excerpt_note = if truncated { + "(excerpt truncated for prompt size)" + } else { + "" + }; // Stage 1: Gemini 2.5 Flash for description - let desc = generate_text_with_model( - "gemini-2.5-flash", - &format!( - "Describe the file '{filename}' and extract all key components, keywords, and details for later vectorization. Be comprehensive and factual." - ), - ) - .await - .unwrap_or_else(|e| format!("[desc error: {}]", e)); + let desc_prompt = format!( + "You are reviewing the PDF file '{filename}'. Use the following extracted text {excerpt_note} to produce a concise, factual description and key highlights that will help downstream search and reasoning.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---", + file_excerpt + ); + let desc = generate_text_with_model("gemini-2.5-flash", &desc_prompt) + .await + .unwrap_or_else(|e| format!("[desc error: {}]", e)); sqlx::query( "UPDATE files SET description = ?, analysis_status = 'InProgress' WHERE id = ?", ) @@ -92,14 +110,13 @@ impl FileWorker { .await?; // Stage 2: Gemini 2.5 Pro for deep vector graph data - let vector_graph = generate_text_with_model( - "gemini-2.5-pro", - &format!( - "Given the file '{filename}' and its description: {desc}\nGenerate a set of vector graph data (keywords, use cases, relationships) that can be used for broad and precise search. Only include what is directly supported by the file." - ), - ) - .await - .unwrap_or_else(|e| format!("[vector error: {}]", e)); + let vector_prompt = format!( + "You are constructing vector search metadata for the PDF file '{filename}'.\nCurrent description: {desc}\nUse the extracted text {excerpt_note} below to derive precise keywords, thematic clusters, and relationships that are explicitly supported by the content. Provide richly structured bullet points grouped by themes.\n\n--- BEGIN EXCERPT ---\n{}\n--- END EXCERPT ---", + file_excerpt + ); + let vector_graph = generate_text_with_model("gemini-2.5-pro", &vector_prompt) + .await + .unwrap_or_else(|e| format!("[vector error: {}]", e)); // Stage 3: Embed and upsert to Qdrant let emb = demo_text_embedding(&vector_graph).await?; @@ -138,3 +155,72 @@ impl FileWorker { Ok(()) } } + +// Maximum number of characters from the extracted text to include in prompts. +const MAX_EXCERPT_CHARS: usize = 4000; + +async fn extract_file_excerpt(path: &str) -> Result<(String, bool)> { + let path_buf = PathBuf::from(path); + let extension = path_buf + .extension() + .and_then(|e| e.to_str()) + .map(|s| s.to_ascii_lowercase()) + .unwrap_or_default(); + + let raw_text = if extension == "pdf" { + let pdf_path = path_buf.clone(); + tokio::task::spawn_blocking(move || extract_text(&pdf_path)) + .await + .map_err(|e| anyhow!("pdf text extraction task panicked: {e}"))?? + } else { + let bytes = tokio::fs::read(&path_buf) + .await + .with_context(|| format!("reading file bytes from {path}"))?; + String::from_utf8_lossy(&bytes).into_owned() + }; + + let cleaned = raw_text.replace('\r', ""); + let condensed = collapse_whitespace(&cleaned); + let (excerpt, truncated) = truncate_to_chars(&condensed, MAX_EXCERPT_CHARS); + + Ok((excerpt, truncated)) +} + +fn truncate_to_chars(text: &str, max_chars: usize) -> (String, bool) { + if max_chars == 0 { + return (String::new(), !text.is_empty()); + } + + let mut result = String::new(); + let mut chars = text.chars(); + for _ in 0..max_chars { + match chars.next() { + Some(ch) => result.push(ch), + None => return (result, false), + } + } + + if chars.next().is_some() { + result.push('…'); + (result, true) + } else { + (result, false) + } +} + +fn collapse_whitespace(input: &str) -> String { + let mut output = String::with_capacity(input.len()); + let mut prev_was_ws = false; + for ch in input.chars() { + if ch.is_whitespace() { + if !prev_was_ws { + output.push(' '); + } + prev_was_ws = true; + } else { + prev_was_ws = false; + output.push(ch); + } + } + output.trim().to_string() +}