diff --git a/Cargo.lock b/Cargo.lock index 25abaf7b21..2b7ec0c47f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -338,9 +338,6 @@ name = "cc" version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" -dependencies = [ - "jobserver", -] [[package]] name = "cedarwood" @@ -513,7 +510,7 @@ dependencies = [ "clap", "criterion-plot", "csv", - "itertools 0.10.1", + "itertools", "lazy_static", "num-traits", "oorandom", @@ -535,7 +532,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ "cast", - "itertools 0.10.1", + "itertools", ] [[package]] @@ -902,25 +899,15 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - [[package]] name = "grenad" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7824d499230110f4e4a8d4fd3fd4dc15c1347fce5082e4bba82eef17f43e1ed8" +checksum = "1a7a9cc43b28a20f791b17863f34a36654fdfa50be6d0a67bb18c1e34d145f18" dependencies = [ "bytemuck", "byteorder", - "flate2", - "lz4_flex", - "snap", "tempfile", - "zstd", ] [[package]] @@ -1055,7 +1042,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.12.0" +version = "0.13.1" dependencies = [ "anyhow", "byte-unit", @@ -1109,7 +1096,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.12.0" +version = "0.13.1" dependencies = [ "anyhow", "askama", @@ -1277,7 +1264,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.12.0" +version = "0.13.1" dependencies = [ "anyhow", "byte-unit", @@ -1315,15 +1302,6 @@ 
version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" -[[package]] -name = "itertools" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.10.1" @@ -1375,15 +1353,6 @@ dependencies = [ "regex", ] -[[package]] -name = "jobserver" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.53" @@ -1484,15 +1453,6 @@ dependencies = [ "syn 0.15.44", ] -[[package]] -name = "lz4_flex" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827b976d911b5d2e42b2ccfc7c0d2461a1414e8280436885218762fc529b3f8" -dependencies = [ - "twox-hash", -] - [[package]] name = "maplit" version = "1.0.2" @@ -1548,7 +1508,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.12.0" +version = "0.13.1" dependencies = [ "big_s", "bimap", @@ -1565,7 +1525,7 @@ dependencies = [ "grenad", "heed", "human_format", - "itertools 0.10.1", + "itertools", "levenshtein_automata", "linked-hash-map", "log", @@ -1588,7 +1548,6 @@ dependencies = [ "smallvec", "tempfile", "uuid", - "vec-utils", ] [[package]] @@ -2563,12 +2522,6 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" -[[package]] -name = "snap" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" - [[package]] name = "socket2" version = "0.3.19" @@ -3008,16 +2961,6 @@ dependencies = [ "memchr", ] 
-[[package]] -name = "twox-hash" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f559b464de2e2bdabcac6a210d12e9b5a5973c251e102c44c585c71d51bd78e" -dependencies = [ - "cfg-if 1.0.0", - "static_assertions", -] - [[package]] name = "typenum" version = "1.14.0" @@ -3123,12 +3066,6 @@ dependencies = [ "getrandom 0.2.3", ] -[[package]] -name = "vec-utils" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dac984aa016c26ef4ed7b2c30d6a1bd570fd40a078caccaf6415a2ac5d96161" - [[package]] name = "vec_map" version = "0.8.2" @@ -3396,34 +3333,3 @@ dependencies = [ "syn 1.0.75", "synstructure", ] - -[[package]] -name = "zstd" -version = "0.5.4+zstd.1.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69996ebdb1ba8b1517f61387a883857818a66c8a295f487b1ffd8fd9d2c82910" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "2.0.6+zstd.1.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98aa931fb69ecee256d44589d19754e61851ae4769bf963b385119b1cc37a49e" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "1.4.18+zstd.1.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e6e8778706838f43f771d80d37787cb2fe06dafe89dd3aebaf6721b9eaec81" -dependencies = [ - "cc", - "glob", - "itertools 0.9.0", - "libc", -] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d1fdf09975..27a0895d54 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -35,7 +35,6 @@ smallstr = { version = "0.2.0", features = ["serde"] } smallvec = { version = "1.6.1", features = ["write"] } tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } -vec-utils = "0.3.0" # facet filter parser pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } diff --git a/milli/src/documents/serde.rs 
b/milli/src/documents/serde.rs index cd5617c52c..76dc8915c5 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -236,11 +236,11 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { type Error = Error; fn serialize_key(&mut self, _key: &T) -> Result<(), Self::Error> { - unimplemented!() + unreachable!() } fn serialize_value(&mut self, _value: &T) -> Result<(), Self::Error> { - unimplemented!() + unreachable!() } fn end(mut self) -> Result { @@ -262,8 +262,8 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { K: Serialize, V: Serialize, { - let field_serilizer = FieldSerializer { index: &mut self.index }; - let field_id: FieldId = key.serialize(field_serilizer)?; + let field_serializer = FieldSerializer { index: &mut self.index }; + let field_id: FieldId = key.serialize(field_serializer)?; self.buffer.clear(); let mut cursor = io::Cursor::new(&mut self.buffer); @@ -294,7 +294,7 @@ impl<'a> serde::Serializer for FieldSerializer<'a> { fn serialize_str(self, ws: &str) -> Result { let field_id = match self.index.get_by_right(ws) { - Some(field) => *field, + Some(field_id) => *field_id, None => { let field_id = self.index.len() as FieldId; self.index.insert(field_id, ws.to_string()); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f428958f9a..a364a4fbcd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -10,7 +10,6 @@ use itertools::Itertools; use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use vec_utils::VecExt; use super::helpers::{ create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, @@ -131,7 +130,7 @@ impl Transform<'_, '_> { let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, &[u8])> = Vec::new(); while let Some((addition_index, document)) = reader.next_document_with_index()? 
{ - let mut field_buffer_cache = field_buffer.drop_and_reuse(); + let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, @@ -217,7 +216,7 @@ impl Transform<'_, '_> { }); obkv_buffer.clear(); - field_buffer = field_buffer_cache.drop_and_reuse(); + field_buffer = drop_and_reuse(field_buffer_cache); external_id_buffer.clear(); } @@ -482,6 +481,20 @@ fn validate_document_id(document_id: &str) -> Option<&str> { }) } +/// This function drops and reuses a Vec, transmuting its contained type along the way. +/// +/// This is useful when you want to change the lifetime associated with a vec of references, while +/// keeping the allocation. +/// +/// The trick here is that the call to collect will reuse the vec allocation. +fn drop_and_reuse(mut vec: Vec) -> Vec { + debug_assert_eq!(std::mem::align_of::(), std::mem::align_of::()); + debug_assert_eq!(std::mem::size_of::(), std::mem::size_of::()); + vec.clear(); + debug_assert!(vec.is_empty()); + vec.into_iter().map(|_| unreachable!()).collect() +} + #[cfg(test)] mod test { use super::*; diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index c9720d6520..2f9191c712 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,10 +1,12 @@ use std::cmp::Reverse; +use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; -use milli::update::{Settings, UpdateBuilder, UpdateFormat}; +use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::update::{Settings, UpdateBuilder}; use milli::{AscDesc, Criterion, Index, Search, SearchResult}; use rand::Rng; use Criterion::*; @@ -386,13 +388,13 @@ fn criteria_ascdesc() { let mut builder = UpdateBuilder::new(0); builder.max_memory(10 * 1024 * 1024); // 10MiB let mut builder = 
builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); builder.enable_autogenerate_docids(); - let content = [ - vec![S("name,age")], - (0..ASC_DESC_CANDIDATES_THRESHOLD + 1) - .map(|_| { + let mut cursor = Cursor::new(Vec::new()); + let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + (0..ASC_DESC_CANDIDATES_THRESHOLD + 1) + .for_each(|_| { let mut rng = rand::thread_rng(); let age = rng.gen::().to_string(); @@ -403,14 +405,21 @@ fn criteria_ascdesc() { .take(10) .collect::(); - format!("{},{}", name, age) - }) - .collect::>(), - ] - .iter() - .flatten() - .join("\n"); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + let json = serde_json::json!({ + "name": name, + "age": age, + }); + + batch_builder.add_documents(json).unwrap(); + }); + + batch_builder.finish().unwrap(); + + cursor.set_position(0); + + let reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap();