diff --git a/.gitignore b/.gitignore
index edd3e675c..3d9534d5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,5 +16,5 @@ milli/target/
 ## ... unreviewed
 *.snap.new
 
-# Fuzzcheck data for the facet indexing fuzz test
-milli/fuzz/update::facet::incremental::fuzz::fuzz/
+# Fuzzcheck data
+milli/fuzz/*
\ No newline at end of file
diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs
index d10136ace..34b00870e 100644
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@@ -38,6 +38,7 @@
 //! field = _geoRadius(12, 13, 14)
 //! ```
 //!
+#![feature(anonymous_lifetime_in_impl_trait)]
 
 mod condition;
 mod error;
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index f5277a8fa..a010565b7 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -57,7 +57,7 @@ md5 = "0.7.0"
 rand = {version = "0.8.5", features = ["small_rng"] }
 
 [target.'cfg(fuzzing)'.dev-dependencies]
-fuzzcheck = "0.12.1"
+fuzzcheck = { path = "/Users/meilisearch/Documents/fuzzcheck-rs/fuzzcheck" }
 
 [features]
 default = [ "charabia/default" ]
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 865195df5..4e665857b 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -1,4 +1,4 @@
-#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
+#![cfg_attr(all(test, fuzzing), feature(no_coverage, once_cell))]
 
 #[macro_use]
 pub mod documents;
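The rest of the patch is the new fuzz test itself. For readers who haven't used fuzzcheck: it is an in-process, coverage-guided fuzzer whose tests are written like ordinary property tests and launched through the builder API used at the bottom of the new file. The sketch below only illustrates that harness pattern; the `Vec<u8>` input and the reverse-twice property are placeholders, not part of this patch:

```rust
// A minimal fuzzcheck harness following the same builder chain as the real
// test below; the Vec<u8> input and reverse-twice property are placeholders.
#[cfg(all(test, fuzzing))]
#[test]
fn fuzz_sketch() {
    use fuzzcheck::mutators::vector::VecMutator;
    use fuzzcheck::DefaultMutator;

    let result = fuzzcheck::fuzz_test(|xs: &[u8]| {
        // The property: a double reversal must restore the original input.
        let mut ys = xs.to_vec();
        ys.reverse();
        ys.reverse();
        assert_eq!(ys, xs);
    })
    .mutator(VecMutator::new(u8::default_mutator(), 0..=100)) // input generator
    .serde_serializer()                // save interesting inputs as JSON
    .default_sensor_and_pool()         // guide mutation by code coverage
    .arguments_from_cargo_fuzzcheck()  // CLI args from `cargo fuzzcheck`
    .launch();
    assert!(!result.found_test_failure);
}
```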
diff --git a/milli/src/update/fuzz.rs b/milli/src/update/fuzz.rs
new file mode 100644
index 000000000..1b667273d
--- /dev/null
+++ b/milli/src/update/fuzz.rs
@@ -0,0 +1,504 @@
+// Things tested by this fuzz test
+//
+// - A few different document identifiers only
+// - Simple setting updates (searchable and filterable attributes only)
+// - Document deletion (given existing or non-existing external document ids)
+// - Clear documents
+// - Batched document imports
+// - Update/replacement of existing documents
+// - Each operation with and without soft deletion
+// - Empty document imports
+// - No crash should ever happen
+
+// A small sample of what isn't tested:
+//
+// - The correctness of the indexing operations
+// - Indexing mistakes that happen when many different documents are inserted
+// - Long batches of document imports
+// - Nested fields (not tested well anyway)
+// - Any search result
+// - Arbitrary document contents
+//   (instead, the components of the documents are pre-written manually)
+// - Index creation / deletion
+// - Autogenerated docids
+// - Indexing for geosearch
+// - Documents with too many field ids or too many words in a field id
+// - Anything related to the prefix databases
+// - Incorrect setting updates
+// - The logic that chooses between soft and hard deletion
+//   (the choice is instead set manually for each operation)
+// - Different IndexerConfig parameters
+
+// Efficiency tips:
+//
+// - Use a RAM disk (see https://stackoverflow.com/questions/46224103/create-apfs-ram-disk-on-macos-high-sierra)
+// - Change the value of the TMPDIR environment variable to a folder in the RAM disk
+
+// Quality:
+// - finds issue 2945 within a few minutes if either of the two fixes below is missing
+//   - issue 2945: https://github.com/meilisearch/meilisearch/issues/2945
+//   - fix 1: https://github.com/meilisearch/milli/pull/723
+//   - fix 2: https://github.com/meilisearch/milli/pull/734
+// - but doesn't detect anything wrong if this fix is not included: https://github.com/meilisearch/milli/pull/690
+//   - because it doesn't cause any crash, I think
+// - each fuzz test iteration is quite slow
+// - for this fuzz test in particular, it is good to let it run for a few hours, or even a day
+
+use std::collections::HashSet;
+use std::hash::Hash;
+use std::iter::FromIterator;
+use std::sync::LazyLock;
+
+use fuzzcheck::mutators::integer_within_range::U8WithinRangeMutator;
+use fuzzcheck::mutators::option::OptionMutator;
+use fuzzcheck::mutators::unique::UniqueMutator;
+use fuzzcheck::mutators::vector::VecMutator;
+use fuzzcheck::DefaultMutator;
+use heed::{EnvOpenOptions, RwTxn};
+use serde::{Deserialize, Serialize};
+use tempfile::TempDir;
+
+use super::index_documents::validate_document_id;
+use super::{
+    ClearDocuments, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
+};
+use crate::{db_snap, Index, Search, SearchResult};
+
+/// The list of document identifiers that we choose to test
+static DOCUMENT_IDENTIFIERS: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
+    let mut ids = vec![];
+    for i in 0..10 {
+        ids.push(serde_json::json!(i));
+        ids.push(serde_json::json!(format!("{i}")));
+    }
+    ids.push(serde_json::json!("complex-ID-1_2"));
+    ids.push(serde_json::json!("1-2-3-4"));
+    ids.push(serde_json::json!("invalidsupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious"));
+    // ids.push(serde_json::json!("invalid.id"));
+    ids
+});
+
+/// The list of field values that we choose to test
+static FIELD_VALUES: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
+    let mut vals = vec![];
+    for i in 0..10i32 {
+        vals.push(serde_json::json!(i));
+        vals.push(serde_json::json!((i as f64) / 3.4));
+        vals.push(serde_json::json!(111.1_f32.powi(i)));
+        vals.push(serde_json::json!(format!("{i}")));
+        vals.push(serde_json::json!([i - 1, format!("{i}"), i + 1, format!("{}", i - 1), i - 2]));
+        vals.push(serde_json::json!(format!("{}", "a".repeat(i as usize))));
+    }
+    vals.push(serde_json::json!({ "nested": ["value", { "nested": ["value", "value", "the quick brown fox jumps over the lazy dog, wow!"] }], "value": 0}));
+    vals.push(serde_json::json!("the quick brown fox jumps over the lazy dog, wow!"));
+    vals.push(serde_json::json!("the quick brown supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious fox jumps over the lazy dog"));
+    vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1 }));
+    vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1, "other": 10.0 }));
+    vals.push(serde_json::json!({ "lat": -23.0, "lon": -22.1 }));
+    vals.push(serde_json::json!({ "lat": 93.0, "lon": 22.1 }));
+    vals.push(serde_json::json!({ "lat": 90.0, "lon": 221.1 }));
+    vals
+});
+/// The list of field keys that we choose to test
+static FIELD_KEYS: LazyLock<Vec<String>> = LazyLock::new(|| {
+    let mut keys = vec![];
+    for f in ["identifier", "field1", "field2", "_geo"] {
+        keys.push(f.to_owned());
+        for g in [
+            "nested",
+            "value",
+            "nested.value",
+            "nested.value.nested",
+            "_geo",
+            "lat",
+            "lon",
+            "other",
+        ] {
+            let mut key = f.to_owned();
+            key.push('.');
+            key.push_str(g);
+            keys.push(key);
+        }
+    }
+    keys
+});
+fn document_identifier(i: u8) -> serde_json::Value {
+    DOCUMENT_IDENTIFIERS[i as usize].clone()
+}
+fn document_identifier_string(i: u8) -> String {
+    let value = document_identifier(i);
+    match value {
+        serde_json::Value::Number(x) => format!("{x}"),
+        serde_json::Value::String(s) => s,
+        _ => panic!(),
+    }
+}
+fn field_key(i: u8) -> String {
+    FIELD_KEYS[i as usize].clone()
+}
+fn field_value(i: u8) -> serde_json::Value {
+    FIELD_VALUES[i as usize].clone()
+}
+fn document_identifier_index_mutator() -> U8WithinRangeMutator {
+    U8WithinRangeMutator::new(..DOCUMENT_IDENTIFIERS.len() as u8)
+}
+fn field_key_index_mutator() -> U8WithinRangeMutator {
+    U8WithinRangeMutator::new(..FIELD_KEYS.len() as u8)
+}
+fn field_value_index_mutator() -> U8WithinRangeMutator {
+    U8WithinRangeMutator::new(..FIELD_VALUES.len() as u8)
+}
+
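Note the design here: the fuzzer never mutates identifiers, keys, or values directly. It only mutates small `u8` indices into the fixed pools above, which keeps the search space tiny while still covering integers, floats, strings, arrays, nested objects, and `_geo`-shaped values. A sketch of how indices resolve (illustrative only; the expected values follow from the pool-construction loops above):

```rust
// Sketch: resolving u8 indices against the fixed pools. Identifiers
// alternate json!(i) / json!("i"), so index 0 is json!(0), index 3 is "1".
#[test]
fn pools_sketch() {
    assert_eq!(document_identifier(0), serde_json::json!(0));
    assert_eq!(document_identifier(1), serde_json::json!("0"));
    assert_eq!(document_identifier_string(3), "1".to_owned());
    assert_eq!(field_key(0), "identifier");
    assert_eq!(field_value(0), serde_json::json!(0));
    // The range mutators can only ever produce in-bounds indices.
    let _ = document_identifier_index_mutator();
}
```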
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+enum Operation {
+    SettingsUpdate(SettingsUpdate),
+    DocumentImport(DocumentImport),
+    DocumentDeletion(DocumentDeletion),
+    Clear,
+}
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+enum Method {
+    Update,
+    Replace,
+}
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+struct DocumentImport {
+    disable_soft_deletion: bool,
+    method: Method,
+    documents: DocumentImportBatch,
+}
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+struct SettingsUpdate {
+    // Adding filterable fields slows down the fuzzer a lot
+    // #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
+    //     OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
+    // })]
+    // filterable_fields: Option<Vec<u8>>,
+    #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
+        OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
+    })]
+    searchable_fields: Option<Vec<u8>>,
+}
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+struct DocumentDeletion {
+    disable_soft_deletion: bool,
+    #[field_mutator(VecMutator<u8, U8WithinRangeMutator> = {
+        VecMutator::new(document_identifier_index_mutator(), 0..=10)
+    })]
+    external_document_ids: Vec<u8>,
+}
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+struct Document {
+    #[field_mutator(U8WithinRangeMutator = { document_identifier_index_mutator() })]
+    identifier: u8,
+    #[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
+        OptionMutator::new(field_value_index_mutator())
+    })]
+    field1: Option<u8>,
+    #[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
+        OptionMutator::new(field_value_index_mutator())
+    })]
+    field2: Option<u8>,
+}
+#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
+struct DocumentImportBatch {
+    #[field_mutator(VecMutator<Document, <Document as DefaultMutator>::Mutator> = {
+        VecMutator::new(Document::default_mutator(), 0..=10)
+    })]
+    docs1: Vec<Document>,
+    #[field_mutator(VecMutator<Document, <Document as DefaultMutator>::Mutator> = {
+        VecMutator::new(Document::default_mutator(), 0..=5)
+    })]
+    docs2: Vec<Document>,
+}
+
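Since `Operation` derives `Serialize` and `Deserialize`, every corpus entry is plain serde JSON in the default externally tagged representation; this is exactly what the `reproduce` test at the bottom of the file pastes back in. A sketch of the shape (illustrative, not part of the patch):

```rust
// Sketch: Operation's externally tagged JSON shape (what the fuzzer's
// corpus files and the `reproduce` test below contain).
#[test]
fn operation_json_shape() {
    let op: Operation = serde_json::from_value(serde_json::json!({
        "DocumentDeletion": {
            "disable_soft_deletion": false,
            "external_document_ids": [0, 3]
        }
    }))
    .unwrap();
    assert!(matches!(op, Operation::DocumentDeletion(_)));
}
```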
+fn apply_document_deletion<'i>(
+    wtxn: &mut RwTxn<'i, '_>,
+    index: &'i Index,
+    deletion: &DocumentDeletion,
+) {
+    let DocumentDeletion { disable_soft_deletion, external_document_ids } = deletion;
+    let mut builder = DeleteDocuments::new(wtxn, index).unwrap();
+    builder.disable_soft_deletion(*disable_soft_deletion);
+    for id in external_document_ids {
+        let id = document_identifier(*id);
+        let id = match id {
+            serde_json::Value::Number(n) => format!("{n}"),
+            serde_json::Value::String(s) => s,
+            _ => panic!(),
+        };
+        let _ = builder.delete_external_id(id.as_str());
+    }
+    builder.execute().unwrap();
+}
+
+fn apply_document_import<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, import: &DocumentImport) {
+    let DocumentImport {
+        disable_soft_deletion,
+        method,
+        documents: DocumentImportBatch { docs1, docs2 },
+    } = import;
+    let indexer_config = IndexerConfig::default();
+    let mut builder = IndexDocuments::new(
+        wtxn,
+        index,
+        &indexer_config,
+        IndexDocumentsConfig {
+            update_method: match method {
+                Method::Update => super::IndexDocumentsMethod::UpdateDocuments,
+                Method::Replace => super::IndexDocumentsMethod::ReplaceDocuments,
+            },
+            disable_soft_deletion: *disable_soft_deletion,
+            autogenerate_docids: false,
+            ..IndexDocumentsConfig::default()
+        },
+        |_| {},
+        || false,
+    )
+    .unwrap();
+
+    let make_real_docs = |docs: &Vec<Document>| {
+        docs.iter()
+            .map(|doc| {
+                let Document { identifier, field1, field2 } = doc;
+                let mut object = crate::Object::new();
+                let identifier = document_identifier(*identifier);
+                object.insert("identifier".to_owned(), serde_json::json!(identifier));
+                if let Some(field1) = field1 {
+                    let field1 = field_value(*field1);
+                    object.insert("field1".to_owned(), field1);
+                }
+                if let Some(field2) = field2 {
+                    let field2 = field_value(*field2);
+                    object.insert("field2".to_owned(), field2);
+                }
+                object
+            })
+            .collect::<Vec<_>>()
+    };
+
+    let docs1 = make_real_docs(docs1);
+
+    let (new_builder, _user_error) = builder.add_documents(documents!(docs1)).unwrap();
+    builder = new_builder;
+
+    let docs2 = make_real_docs(docs2);
+    // println!("docs2: {docs2:?}");
+
+    let (new_builder, user_error) = builder.add_documents(documents!(docs2)).unwrap();
+    builder = new_builder;
+    // println!("user_error: {user_error:?}");
+
+    let _ = builder.execute().unwrap();
+    // println!("result: {result:?}");
+}
+
+fn apply_settings_update<'i>(
+    wtxn: &mut RwTxn<'i, '_>,
+    index: &'i Index,
+    settings: &SettingsUpdate,
+) {
+    let SettingsUpdate { searchable_fields /* , filterable_fields */ } = settings;
+    let indexer_config = IndexerConfig::default();
+    let mut settings = Settings::new(wtxn, index, &indexer_config);
+    // match filterable_fields {
+    //     Some(fields) => {
+    //         let fields = fields.iter().map(|f| field_key(*f)).collect();
+    //         settings.set_filterable_fields(fields);
+    //     }
+    //     None => settings.reset_filterable_fields(),
+    // }
+    match searchable_fields {
+        Some(fields) => {
+            let fields = fields.iter().map(|f| field_key(*f)).collect();
+            settings.set_searchable_fields(fields);
+        }
+        None => settings.reset_searchable_fields(),
+    }
+    settings.execute(|_| {}, || false).unwrap();
+}
+
+fn apply_operation<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, operation: &Operation) {
+    match operation {
+        Operation::SettingsUpdate(settings) => apply_settings_update(wtxn, index, settings),
+        Operation::DocumentImport(import) => apply_document_import(wtxn, index, import),
+        Operation::DocumentDeletion(deletion) => apply_document_deletion(wtxn, index, deletion),
+        Operation::Clear => {
+            let builder = ClearDocuments::new(wtxn, index);
+            let _result = builder.execute().unwrap();
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+struct IdsTracker {
+    ids: HashSet<String>,
+}
+impl IdsTracker {
+    fn apply_operation(&mut self, operation: &Operation) {
+        match operation {
+            Operation::SettingsUpdate(_) => {}
+            Operation::DocumentImport(DocumentImport {
+                documents: DocumentImportBatch { docs1, docs2 },
+                ..
+            }) => {
+                for d in docs1.iter().chain(docs2.iter()) {
+                    let Document { identifier, .. } = d;
+                    let id_str = document_identifier_string(*identifier);
+                    // if let Some(id_str) = validate_document_id(&id_str) {
+                    self.ids.insert(id_str.to_owned());
+                    // }
+                }
+            }
+            Operation::DocumentDeletion(DocumentDeletion { external_document_ids, .. }) => {
+                for identifier in external_document_ids.iter() {
+                    self.ids.remove(&document_identifier_string(*identifier));
+                }
+            }
+            Operation::Clear => {
+                self.ids.clear();
+            }
+        }
+    }
+}
+
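`IdsTracker` is the reference model of this differential test: it tracks, in plain Rust, the set of external document ids that should be visible after each operation, and the fuzz property below compares the real index against it. A worked example of what the model predicts (illustrative only):

```rust
// Sketch: import identifier indices 0 and 2, then delete index 0.
// Index 0 resolves to json!(0) ("0") and index 2 to json!(1) ("1"),
// so only "1" should remain in the model.
#[test]
fn ids_tracker_sketch() {
    let mut tracker = IdsTracker::default();
    tracker.apply_operation(&Operation::DocumentImport(DocumentImport {
        disable_soft_deletion: false,
        method: Method::Replace,
        documents: DocumentImportBatch {
            docs1: vec![
                Document { identifier: 0, field1: None, field2: None },
                Document { identifier: 2, field1: None, field2: None },
            ],
            docs2: vec![],
        },
    }));
    tracker.apply_operation(&Operation::DocumentDeletion(DocumentDeletion {
        disable_soft_deletion: false,
        external_document_ids: vec![0],
    }));
    assert_eq!(tracker.ids, HashSet::from_iter(["1".to_owned()]));
}
```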
+#[test]
+fn fuzz() {
+    let tempdir = TempDir::new_in("/Volumes/Ramdisk").unwrap();
+
+    let mut options = EnvOpenOptions::new();
+    options.map_size(4096 * 1000 * 1000);
+
+    let index = {
+        let index = Index::new(options, tempdir.path()).unwrap();
+        let mut wtxn = index.write_txn().unwrap();
+        let indexer_config = IndexerConfig::default();
+        let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
+        settings.set_primary_key("identifier".to_owned());
+        settings.execute(|_| {}, || false).unwrap();
+        wtxn.commit().unwrap();
+        index
+    };
+
+    let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| {
+        let mut ids_tracker = IdsTracker::default();
+        let mut wtxn = index.write_txn().unwrap();
+        for operation in operations {
+            apply_operation(&mut wtxn, &index, operation);
+            ids_tracker.apply_operation(operation);
+        }
+        let mut search = Search::new(&wtxn, &index);
+        search.limit(1000);
+        let SearchResult { matching_words: _, candidates: _, documents_ids } =
+            search.execute().unwrap();
+        let primary_key_id = index.fields_ids_map(&wtxn).unwrap().id("identifier").unwrap();
+        // documents_ids.sort_unstable();
+        let docs = index.documents(&wtxn, documents_ids.iter().copied()).unwrap();
+        let mut all_ids = HashSet::new();
+        let mut all_ids_string = HashSet::new();
+        for (_docid, obkv) in docs {
+            let id = obkv.get(primary_key_id).unwrap();
+            assert!(all_ids.insert(id));
+            let value: serde_json::Value = serde_json::from_slice(id).unwrap();
+            let id = match value {
+                serde_json::Value::Number(x) => format!("{x}"),
+                serde_json::Value::String(s) => s,
+                _ => panic!(),
+            };
+            assert!(all_ids_string.insert(id));
+        }
+        let all_documents_ids = index.documents_ids(&wtxn).unwrap();
+        let returned_documents_ids = roaring::RoaringBitmap::from_iter(documents_ids);
+        assert_eq!(all_documents_ids, returned_documents_ids);
+
+        assert_eq!(ids_tracker.ids, all_ids_string);
+
+        wtxn.abort().unwrap();
+    })
+    // We use a bloom filter (through UniqueMutator) to prevent the same test input from being tested too many times
+    .mutator(UniqueMutator::new(VecMutator::new(Operation::default_mutator(), 0..=20), |x| x))
+    .serde_serializer()
+    .default_sensor_and_pool()
+    .arguments_from_cargo_fuzzcheck()
+    .launch();
+    assert!(!result.found_test_failure);
+}
+
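Because failing inputs are saved as JSON, a minimized reproducer can be replayed outside the fuzzer, which is what `reproduce` below does with one such input. A smaller, hypothetical round trip (note that unit variants such as `Clear` serialize as bare strings under external tagging):

```rust
// Sketch: a saved input is plain JSON for Vec<Operation>.
#[test]
fn replay_sketch() {
    let ops = r#"[ "Clear",
        { "DocumentDeletion": { "disable_soft_deletion": true, "external_document_ids": [1] } }
    ]"#;
    let operations: Vec<Operation> = serde_json::from_str(ops).unwrap();
    assert_eq!(operations.len(), 2);
}
```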
+#[test]
+fn reproduce() {
+    let ops = r##"
+    [
+        {
+            "DocumentImport": {
+                "disable_soft_deletion": false,
+                "method": "Update",
+                "documents": {
+                    "docs1": [],
+                    "docs2": [
+                        {
+                            "identifier": 18,
+                            "field1": 62,
+                            "field2": 8
+                        },
+                        {
+                            "identifier": 12,
+                            "field1": null,
+                            "field2": 22
+                        },
+                        {
+                            "identifier": 23,
+                            "field1": null,
+                            "field2": null
+                        }
+                    ]
+                }
+            }
+        }
+    ]
+    "##;
+    let tempdir = TempDir::new_in("/Volumes/Ramdisk").unwrap();
+
+    let mut options = EnvOpenOptions::new();
+    options.map_size(4096 * 1000 * 1000);
+
+    let (index, primary_key_id) = {
+        let index = Index::new(options, tempdir.path()).unwrap();
+        let mut wtxn = index.write_txn().unwrap();
+        let indexer_config = IndexerConfig::default();
+        let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
+        settings.set_primary_key("identifier".to_owned());
+        settings.execute(|_| {}, || false).unwrap();
+        let primary_key_id = index.fields_ids_map(&wtxn).unwrap().id("identifier").unwrap();
+        wtxn.commit().unwrap();
+        (index, primary_key_id)
+    };
+    println!("primary_key_id: {primary_key_id}");
+    let operations: Vec<Operation> = serde_json::from_str(ops).unwrap();
+
+    let mut wtxn = index.write_txn().unwrap();
+    for operation in operations.iter() {
+        println!("{operation:?}");
+        apply_operation(&mut wtxn, &index, operation);
+    }
+    let search = Search::new(&wtxn, &index);
+    let SearchResult { matching_words: _, candidates: _, mut documents_ids } =
+        search.execute().unwrap();
+    println!("documents_ids: {documents_ids:?}");
+    let fields_ids_map = index.fields_ids_map(&wtxn).unwrap();
+    println!("fields_ids_map: {fields_ids_map:?}");
+    let primary_key_id = index.fields_ids_map(&wtxn).unwrap().id("identifier").unwrap();
+    println!("primary key id again: {primary_key_id}");
+    documents_ids.sort_unstable();
+    let docs = index.documents(&wtxn, documents_ids).unwrap();
+    let mut all_ids = HashSet::new();
+    for (docid, obkv) in docs {
+        println!("docid {docid}:");
+        for (k, v) in obkv.iter() {
+            println!("{k} : {v:?}");
+        }
+        let id = obkv.get(primary_key_id).unwrap();
+        assert!(all_ids.insert(id));
+    }
+
+    wtxn.commit().unwrap();
+}
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index 2dda24172..2126b34d5 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -18,6 +18,8 @@ mod available_documents_ids;
 mod clear_documents;
 mod delete_documents;
 pub(crate) mod facet;
+#[cfg(all(fuzzing, test))]
+mod fuzz;
 mod index_documents;
 mod indexer_config;
 mod prefix_word_pairs;