diff --git a/milli/src/index.rs b/milli/src/index.rs index e2ab51a1c7..c8af5aba33 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -808,7 +808,7 @@ pub(crate) mod tests { use maplit::btreemap; use tempfile::TempDir; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::{IndexDocuments}; use crate::Index; pub(crate) struct TempIndex { @@ -844,13 +844,12 @@ pub(crate) mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -869,8 +868,12 @@ pub(crate) mod tests { // we add all the documents a second time. we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 } + ]); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -887,13 +890,12 @@ pub(crate) mod tests { ); // then we update a document by removing one field and another by adding one field - let content = &br#"[ + let content = documents!([ { "id": 1, "name": "kevin", "has_dog": true }, { "id": 2, "name": "bob" } - ]"#[..]; + ]); let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8381b799c3..99bc29fe49 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,6 +1,9 @@ #[macro_use] extern crate pest_derive; +#[macro_use] +pub mod documents; + mod criterion; mod error; mod external_documents_ids; @@ -12,7 +15,6 @@ pub mod proximity; mod search; pub mod tree_level; pub mod update; -pub mod documents; use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index e7dc52a82c..3f576bbc5b 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -27,21 +27,24 @@ pub trait Distinct { #[cfg(test)] mod test { use std::collections::HashSet; + use std::io::Cursor; + use bimap::BiHashMap; use once_cell::sync::Lazy; use rand::seq::SliceRandom; use rand::Rng; use roaring::RoaringBitmap; use serde_json::{json, Value}; + use crate::documents::{DocumentsBuilder, DocumentsReader}; use crate::index::tests::TempIndex; use crate::index::Index; - use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; + use crate::update::{IndexDocumentsMethod, UpdateBuilder}; use crate::{DocumentId, FieldId, BEU32}; - static JSON: Lazy = Lazy::new(generate_json); + static JSON: Lazy> = Lazy::new(generate_json); - fn generate_json() -> Value { + fn generate_json() -> Vec { let mut rng = rand::thread_rng(); let num_docs = rng.gen_range(10..30); @@ -69,7 +72,13 @@ mod test { documents.push(doc); } - Value::Array(documents) + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + + builder.add_documents(documents).unwrap(); + builder.finish().unwrap(); + + cursor.into_inner() } /// Returns a temporary index populated with random test documents, the FieldId for the @@ -89,13 +98,14 @@ mod test { let mut addition = builder.index_documents(&mut txn, &index); addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - addition.update_format(UpdateFormat::Json); - addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); + let reader = crate::documents::DocumentsReader::from_reader(Cursor::new(&*JSON)).unwrap(); + addition.execute(reader, |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let map = (0..JSON.as_array().unwrap().len() as u32).collect(); + let documents = DocumentsReader::from_reader(Cursor::new(&*JSON)).unwrap(); + let map = (0..documents.len() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 789970a8ea..0f74c1e2de 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -80,7 +80,7 @@ mod tests { use heed::EnvOpenOptions; use super::*; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::{IndexDocuments}; #[test] fn clear_documents() { @@ -90,13 +90,12 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 0, "name": "kevin", "age": 20 }, { "id": 1, "name": "kevina" }, { "id": 2, "name": "benoit", "country": "France" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // Clear all documents from the database. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e18c6bbd1d..3dd0ab303b 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -548,7 +548,7 @@ mod tests { use maplit::hashset; use super::*; - use crate::update::{IndexDocuments, Settings, UpdateFormat}; + use crate::update::{IndexDocuments, Settings}; use crate::FilterCondition; #[test] @@ -559,13 +559,12 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. @@ -590,13 +589,12 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "mysuperid": 0, "name": "kevin" }, { "mysuperid": 1, "name": "kevina" }, { "mysuperid": 2, "name": "benoit" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // Delete not all of the documents but some of them. @@ -621,7 +619,7 @@ mod tests { builder.set_filterable_fields(hashset! { S("label") }); builder.execute(|_, _| ()).unwrap(); - let content = &br#"[ + let content = documents!([ {"docid":"1_4","label":"sign"}, {"docid":"1_5","label":"letter"}, {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, @@ -642,9 +640,8 @@ mod tests { {"docid":"1_58","label":"abstract,art,cartoon"}, {"docid":"1_68","label":"design"}, {"docid":"1_69","label":"geometry"} - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // Delete not all of the documents but some of them. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b948b0111c..cb9e7785da 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -822,9 +822,11 @@ mod tests { use std::io::Cursor; use big_s::S; + use bimap::BiHashMap; use heed::EnvOpenOptions; use super::*; + use crate::documents::DocumentsBuilder; use crate::update::DeleteDocuments; use crate::HashMap; @@ -837,9 +839,12 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -851,9 +856,8 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,updated kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ { "id": 1, "name": "updated kevin" } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -865,9 +869,12 @@ mod tests { // Third we send 3 documents again to replace the existing ones. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ + { "id": 1, "name": "updated second kevin" }, + { "id": 2, "name": "updated kevina" }, + { "id": 3, "name": "updated benoit" } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 2); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -888,9 +895,12 @@ mod tests { // First we send 3 documents with duplicate ids and // change the index method to merge documents. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,kevin\n1,kevina\n1,benoit\n"[..]; + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 1, "name": "kevina" }, + { "id": 1, "name": "benoit" } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -915,9 +925,8 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,age\n1,25\n"[..]; + let content = documents!([ { "id": 1, "age": 25 } ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -937,13 +946,13 @@ mod tests { let mut doc_iter = doc.iter(); assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..]))); assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..]))); - assert_eq!(doc_iter.next(), Some((2, &br#""25""#[..]))); + assert_eq!(doc_iter.next(), Some((2, &br#"25"#[..]))); assert_eq!(doc_iter.next(), None); drop(rtxn); } #[test] - fn not_auto_generated_csv_documents_ids() { + fn not_auto_generated_documents_ids() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -951,35 +960,12 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); - assert!(builder.execute(content, |_, _| ()).is_err()); - wtxn.commit().unwrap(); - - // Check that there is no document. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 0); - drop(rtxn); - } - - #[test] - fn not_auto_generated_json_documents_ids() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 3 documents and 2 without ids. - let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ - { "name": "kevina", "id": 21 }, + let content = documents!([ { "name": "kevin" }, + { "name": "kevina" }, { "name": "benoit" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); @@ -999,10 +985,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; + let content = documents!([ + { "name": "kevin" }, + { "name": "kevina" }, + { "name": "benoit" } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1018,10 +1007,9 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); - let content = format!("id,name\n{},updated kevin", kevin_uuid); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -1031,7 +1019,8 @@ mod tests { let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); let (kevin_id, _) = - docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); + docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#) + .unwrap(); let (id, doc) = docs[*kevin_id as usize]; assert_eq!(id, *kevin_id); @@ -1052,9 +1041,12 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1066,9 +1058,9 @@ mod tests { // Second we send 1 document without specifying the id. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name\nnew kevin"[..]; + let content = documents!([ { "name": "new kevin" } ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1080,7 +1072,7 @@ mod tests { } #[test] - fn empty_csv_update() { + fn empty_update() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -1088,9 +1080,8 @@ mod tests { // First we send 0 documents and only headers. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1101,83 +1092,6 @@ mod tests { drop(rtxn); } - #[test] - fn json_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 3 documents with an id for only one of them. - let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ - { "name": "kevin" }, - { "name": "kevina", "id": 21 }, - { "name": "benoit" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Check that there is 3 documents now. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 3); - drop(rtxn); - } - - #[test] - fn empty_json_update() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 0 documents. - let mut wtxn = index.write_txn().unwrap(); - let content = &b"[]"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Check that there is no documents. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 0); - drop(rtxn); - } - - #[test] - fn json_stream_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 3 documents with an id for only one of them. - let mut wtxn = index.write_txn().unwrap(); - let content = &br#" - { "name": "kevin" } - { "name": "kevina", "id": 21 } - { "name": "benoit" } - "#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::JsonStream); - builder.execute(content, |_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Check that there is 3 documents now. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 3); - drop(rtxn); - } - #[test] fn invalid_documents_ids() { let path = tempfile::tempdir().unwrap(); @@ -1188,18 +1102,16 @@ mod tests { // First we send 1 document with an invalid id. let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. - let content = &b"id,name\nbrume bleue,kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. - let content = &b"id,name\n32,kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ { "id": 32, "name": "kevin" } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1211,7 +1123,7 @@ mod tests { } #[test] - fn complex_json_documents() { + fn complex_documents() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -1219,13 +1131,12 @@ mod tests { // First we send 3 documents with an id for only one of them. let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1256,33 +1167,31 @@ mod tests { // First we send 3 documents with an id for only one of them. let mut wtxn = index.write_txn().unwrap(); - let documents = &r#"[ + let documents = documents!([ { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5 }, { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams" } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - let documents = &r#"[ + let documents = documents!([ { "id": 2, "author": "J. Austen", "date": "1813" } - ]"#[..]; + ]); - builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } @@ -1294,14 +1203,13 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, { "objectId": 30, "title": "Hamlet" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -1314,21 +1222,19 @@ mod tests { let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_none()); - let content = &br#"[ + let content = documents!([ { "objectId": 30, "title": "Hamlet" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); - let content = &br#"[ + let content = documents!([ { "objectId": 30, "title": "Hamlet" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1350,12 +1256,16 @@ mod tests { big_object.insert(key, "I am a text!"); } - let content = vec![big_object]; - let content = serde_json::to_string(&content).unwrap(); + let mut cursor = Cursor::new(Vec::new()); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(Cursor::new(content), |_, _| ()).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + builder.add_documents(big_object).unwrap(); + builder.finish().unwrap(); + cursor.set_position(0); + let content = DocumentsReader::from_reader(cursor).unwrap(); + + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } @@ -1368,16 +1278,38 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = r#"#id,title,au{hor,genre,price$ -2,"Prideand Prejudice","Jane Austin","romance",3.5$ -456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$ -1,Wonderland","Lewis Carroll","fantasy",25.99$ -4,"Harry Potter ing","fantasy\0lood Prince","J. K. Rowling","fantasy\0, -"#; - - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + let content = documents!([ + { + "id": 2, + "title": "Prideand Prejudice", + "au{hor": "Jane Austin", + "genre": "romance", + "price$": "3.5$", + }, + { + "id": 456, + "title": "Le Petit Prince", + "au{hor": "Antoine de Saint-Exupéry", + "genre": "adventure", + "price$": "10.0$", + }, + { + "id": 1, + "title": "Wonderland", + "au{hor": "Lewis Carroll", + "genre": "fantasy", + "price$": "25.99$", + }, + { + "id": 4, + "title": "Harry Potter ing fantasy\0lood Prince", + "au{hor": "J. K. Rowling", + "genre": "fantasy\0", + }, + ]); + + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c0b5e45498..24b29f7047 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -113,6 +113,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn log_every_n(&mut self, n: usize) { + self.log_every_n = Some(n); + } + pub fn reset_searchable_fields(&mut self) { self.searchable_fields = Setting::Reset; } @@ -501,7 +505,7 @@ mod tests { use super::*; use crate::error::Error; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::IndexDocuments; use crate::{Criterion, FilterCondition, SearchResult}; #[test] @@ -513,9 +517,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + + let content = documents!([ + { "id": 1, "name": "kevin", "age": 23 }, + { "id": 2, "name": "kevina", "age": 21}, + { "id": 3, "name": "benoit", "age": 34 } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -567,10 +575,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -611,10 +622,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -633,10 +647,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); // In the same transaction we change the displayed fields to be only the age. @@ -678,13 +695,12 @@ mod tests { builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &br#"[ - { "name": "kevin", "age": 23 }, + let content = documents!([ + { "name": "kevin", "age": 23}, { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -695,11 +711,19 @@ mod tests { assert_eq!(fields_ids, hashset! { S("age") }); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. + let fidmap = index.fields_ids_map(&rtxn).unwrap(); + println!("fidmap: {:?}", fidmap); + for document in index.all_documents(&rtxn).unwrap() { + let document = document.unwrap(); + let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1).unwrap(); + println!("json: {:?}", json); + + } let count = index .facet_id_f64_docids .remap_key_type::() // The faceted field id is 2u16 - .prefix_iter(&rtxn, &[0, 2, 0]) + .prefix_iter(&rtxn, &[0, 1, 0]) .unwrap() .count(); assert_eq!(count, 3); @@ -707,25 +731,23 @@ mod tests { // Index a little more documents with new and current facets values. let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ - { "name": "kevin2", "age": 23 }, + let content = documents!([ + { "name": "kevin2", "age": 23}, { "name": "kevina2", "age": 21 }, - { "name": "benoit", "age": 35 } - ]"#[..]; + { "name": "benoit", "age": 35 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. - // TODO we must support typed CSVs for numbers to be understood. let count = index .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 2, 0]) + .prefix_iter(&rtxn, &[0, 1, 0]) .unwrap() .count(); assert_eq!(count, 4); @@ -747,13 +769,12 @@ mod tests { builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &br#"[ - { "name": "kevin", "age": 23 }, + let content = documents!([ + { "name": "kevin", "age": 23}, { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -790,7 +811,7 @@ mod tests { builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &br#"[ + let content = documents!([ { "name": "kevin", "age": 23 }, { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 }, @@ -798,9 +819,8 @@ mod tests { { "name": "bertrand", "age": 34 }, { "name": "bernie", "age": 34 }, { "name": "ben", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -822,10 +842,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -844,10 +867,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs" }, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); // In the same transaction we provide some stop_words @@ -915,10 +941,13 @@ mod tests { // Send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs"}, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); // In the same transaction provide some synonyms @@ -1038,7 +1067,7 @@ mod tests { assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); // Then index some documents with the "mykey" primary key. - let content = &br#"[ + let content = documents!([ { "mykey": 1, "name": "kevin", "age": 23 }, { "mykey": 2, "name": "kevina", "age": 21 }, { "mykey": 3, "name": "benoit", "age": 34 }, @@ -1046,9 +1075,8 @@ mod tests { { "mykey": 5, "name": "bertrand", "age": 34 }, { "mykey": 6, "name": "bernie", "age": 34 }, { "mykey": 7, "name": "ben", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.disable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1087,7 +1115,7 @@ mod tests { builder.set_filterable_fields(hashset! { S("genres") }); builder.execute(|_, _| ()).unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 11, "title": "Star Wars", @@ -1105,9 +1133,8 @@ mod tests { "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", "release_date": 819676800 } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 7d4043ff18..604ae04db5 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,12 +1,14 @@ use std::cmp::Reverse; use std::collections::HashSet; +use std::io::Cursor; use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{IndexDocuments, Settings, UpdateFormat}; +use milli::update::{IndexDocuments, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index}; +use milli::documents::{DocumentsBuilder, DocumentsReader}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -49,11 +51,21 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.set_searchable_fields(vec![S("title"), S("description")]); builder.execute(|_, _| ()).unwrap(); - // index documents let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::JsonStream); builder.enable_autogenerate_docids(); - builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap(); + let mut cursor = Cursor::new(Vec::new()); + let mut documents_builder = DocumentsBuilder::new(&mut cursor, bimap::BiHashMap::new()).unwrap(); + let reader = Cursor::new(CONTENT.as_bytes()); + for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + documents_builder.add_documents(doc.unwrap()).unwrap(); + } + documents_builder.finish().unwrap(); + + cursor.set_position(0); + + // index documents + let content = DocumentsReader::from_reader(cursor).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap();