This repository was archived by the owner on Apr 4, 2023 and is now read-only.

Commit

Merge remote-tracking branch 'origin/main' into obkv-documents
MarinPostma committed Sep 3, 2021
2 parents 8537bf8 + 5cbe879 commit bc7b676
Showing 30 changed files with 301 additions and 99 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/benchmarks.yml
@@ -9,7 +9,7 @@ on:
default: 'search_songs'

env:
HOME: "/home/runner" # The actions-rs/toolchain@v1 can fail we have no $HOME defined
BENCH_NAME: ${{ github.event.inputs.dataset_name }}

jobs:
benchmarks:
@@ -38,14 +38,14 @@ jobs:
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
70 changes: 70 additions & 0 deletions .github/workflows/cron_benchmarks_indexing.yml
@@ -0,0 +1,70 @@
name: Benchmarks indexing (cron)

on:
schedule:
- cron: "30 0 * * FRI" # every friday at 00:30

env:
BENCH_NAME: "indexing"

jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results

# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
70 changes: 70 additions & 0 deletions .github/workflows/cron_benchmarks_search_songs.yml
@@ -0,0 +1,70 @@
name: Benchmarks search songs (cron)

on:
schedule:
- cron: "30 08 * * FRI" # every friday at 08:30

env:
BENCH_NAME: "search_songs"

jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results

# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
70 changes: 70 additions & 0 deletions .github/workflows/cron_benchmarks_search_wiki.yml
@@ -0,0 +1,70 @@
name: Benchmarks search wikipedia articles (cron)

on:
schedule:
- cron: "30 16 * * FRI" # every friday at 16:30 (it’s snacky snack-time!)

env:
BENCH_NAME: "search_wiki"

jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results

# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion helpers/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "helpers"
version = "0.11.0"
version = "0.12.0"
authors = ["Clément Renault <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion http-ui/Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "http-ui"
description = "The HTTP user interface of the milli search engine"
version = "0.11.0"
version = "0.12.0"
authors = ["Clément Renault <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion infos/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "infos"
version = "0.11.0"
version = "0.12.0"
authors = ["Clément Renault <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion milli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "milli"
version = "0.11.0"
version = "0.12.0"
authors = ["Kerollmops <[email protected]>"]
edition = "2018"

@@ -5,6 +5,7 @@ use std::{marker, str};
use crate::error::SerializationError;
use crate::heed_codec::RoaringBitmapCodec;
use crate::{try_split_array_at, try_split_at, Result};

pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>;

/// A codec that encodes a string in front of a value.
@@ -22,7 +23,6 @@ where

fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (string, bytes) = decode_prefix_string(bytes)?;

C::bytes_decode(bytes).map(|item| (string, item))
}
}
@@ -35,7 +35,6 @@

fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
let value_bytes = C::bytes_encode(value)?;

let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
encode_prefix_string(string, &mut bytes).ok()?;
bytes.extend_from_slice(&value_bytes[..]);
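The codec hunk above wraps an inner value codec behind a string prefix; the `Vec::with_capacity(2 + string.len() + value_bytes.len())` call hints at a two-byte length prefix. Below is a minimal round-trip sketch of such a layout, assuming a big-endian u16 length; the helper names echo the diff, but this is an illustration, not milli's actual implementation.

// Hypothetical layout: [u16 BE string length][string bytes][inner value bytes].
fn encode_prefix_string(s: &str, out: &mut Vec<u8>) -> Result<(), std::num::TryFromIntError> {
    let len = u16::try_from(s.len())?;
    out.extend_from_slice(&len.to_be_bytes());
    out.extend_from_slice(s.as_bytes());
    Ok(())
}

fn decode_prefix_string(bytes: &[u8]) -> Option<(&str, &[u8])> {
    if bytes.len() < 2 {
        return None;
    }
    let len = u16::from_be_bytes([bytes[0], bytes[1]]) as usize;
    let rest = &bytes[2..];
    if rest.len() < len {
        return None;
    }
    let string = std::str::from_utf8(&rest[..len]).ok()?;
    Some((string, &rest[len..]))
}

fn main() {
    let mut buf = Vec::new();
    encode_prefix_string("lyrics", &mut buf).unwrap();
    buf.extend_from_slice(&[0xDE_u8, 0xAD]); // stand-in for the inner codec's bytes
    assert_eq!(decode_prefix_string(&buf), Some(("lyrics", &[0xDE_u8, 0xAD][..])));
}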
2 changes: 1 addition & 1 deletion milli/src/search/mod.rs
@@ -145,8 +145,8 @@ impl<'a> Search<'a> {

// We check that we are allowed to use the sort criteria, we check
// that they are declared in the sortable fields.
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
if let Some(sort_criteria) = &self.sort_criteria {
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
for asc_desc in sort_criteria {
let field = asc_desc.field();
if !sortable_fields.contains(field) {
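The milli/src/search/mod.rs change only moves the `sortable_fields` lookup inside the `if let`, so the index is asked for its sortable fields only when sort criteria were actually supplied. A self-contained illustration of that defer-the-lookup pattern, with hypothetical names rather than milli's API:

fn check_sort_criteria(
    sort_criteria: Option<&[String]>,
    load_sortable_fields: impl FnOnce() -> Vec<String>, // e.g. a read against the index
) -> Result<(), String> {
    if let Some(criteria) = sort_criteria {
        // The potentially costly lookup now runs only on this branch.
        let sortable_fields = load_sortable_fields();
        for field in criteria {
            if !sortable_fields.contains(field) {
                return Err(format!("field `{field}` is not sortable"));
            }
        }
    }
    Ok(())
}

fn main() {
    // No sort criteria: the sortable-fields lookup is never executed.
    assert!(check_sort_criteria(None, || vec!["price".to_string()]).is_ok());
    // With criteria: the lookup runs and the field is validated.
    let criteria = ["price".to_string()];
    assert!(check_sort_criteria(Some(&criteria[..]), || vec!["price".to_string()]).is_ok());
}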
1 change: 1 addition & 0 deletions milli/src/update/facets.rs
@@ -57,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
self
}

#[logging_timer::time("Facets::{}")]
pub fn execute(self) -> Result<()> {
self.index.set_updated_at(self.wtxn, &Utc::now())?;
// We get the faceted fields to be able to create the facet levels.
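The attribute added to `Facets::execute` comes from the `logging_timer` crate: it wraps the function so its elapsed time is emitted through the `log` facade, and the "Facets::{}" argument looks like a label pattern filled in with the function name. A minimal sketch, assuming `logging_timer`, `log`, and `env_logger` are declared as dependencies:

// Sketch: timing a function with logging_timer's attribute macro used above.
#[logging_timer::time("Facets::{}")]
fn execute() {
    std::thread::sleep(std::time::Duration::from_millis(25));
}

fn main() {
    // Timer output goes through the `log` facade, so a logger must be initialized.
    env_logger::builder()
        .filter_level(log::LevelFilter::Debug)
        .init();
    execute();
}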
@@ -58,15 +58,16 @@ pub fn extract_fid_docid_facet_values<R: io::Read>(
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
let value_bytes = f64_into_bytes(number).unwrap(); // invalid float
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());

fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
}
}

// insert normalized and original facet string in sorter
for (normalized, original) in strings {
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized.as_bytes());
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
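The extractor change above swaps a panicking `.unwrap()` on `f64_into_bytes` for an `if let`, so facet numbers that cannot be encoded are skipped, and it filters out facet strings whose normalized form is empty before they reach the sorter. A rough sketch of both moves, using stand-in helpers rather than milli's actual encoding:

// Stand-in for milli's f64_into_bytes: refuse values it cannot encode instead of panicking.
fn f64_into_bytes(number: f64) -> Option<[u8; 8]> {
    if number.is_finite() {
        Some(number.to_be_bytes())
    } else {
        None
    }
}

fn collect_facet_values(
    numbers: &[f64],
    strings: Vec<(String, String)>, // (normalized, original)
) -> (Vec<Vec<u8>>, Vec<(String, String)>) {
    let mut number_keys = Vec::new();
    for &number in numbers {
        // Previously a failed conversion panicked; now the value is skipped.
        if let Some(value_bytes) = f64_into_bytes(number) {
            let mut key = value_bytes.to_vec();
            key.extend_from_slice(&number.to_be_bytes());
            number_keys.push(key);
        }
    }
    // Empty normalized strings are dropped before insertion.
    let strings = strings.into_iter().filter(|(n, _)| !n.is_empty()).collect();
    (number_keys, strings)
}

fn main() {
    let (keys, strings) = collect_facet_values(
        &[1.5, f64::NAN],
        vec![(String::new(), "  ".to_string()), ("blue".to_string(), "Blue".to_string())],
    );
    assert_eq!(keys.len(), 1); // NAN was skipped rather than causing a panic
    assert_eq!(strings, vec![("blue".to_string(), "Blue".to_string())]);
}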
@@ -8,6 +8,8 @@ use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::extract_position;
use crate::{DocumentId, FieldId, Result};

@@ -36,7 +38,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
let mut current_document_id = None;

while let Some((key, value)) = docid_word_positions.next()? {
let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap();
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

let curr_document_id = *current_document_id.get_or_insert(document_id);
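Here the `.unwrap()` on `try_split_array_at` becomes `.ok_or(SerializationError::Decoding { .. })?`, so a malformed key surfaces as a typed decoding error naming the database instead of a panic. A small sketch of the same Option-to-Result conversion, with a stand-in error type and database name in place of milli's:

#[derive(Debug)]
enum DecodeError {
    Decoding { db_name: Option<&'static str> },
}

// Stand-in for try_split_array_at: split off a fixed-size prefix if the slice is long enough.
fn try_split_array_at(key: &[u8]) -> Option<([u8; 4], &[u8])> {
    let head: [u8; 4] = key.get(..4)?.try_into().ok()?;
    Some((head, &key[4..]))
}

fn split_docid_key(key: &[u8]) -> Result<(u32, &[u8]), DecodeError> {
    // `ok_or` turns the Option into a Result carrying a descriptive error.
    let (document_id_bytes, word_bytes) = try_split_array_at(key)
        .ok_or(DecodeError::Decoding { db_name: Some("docid-word-positions") })?;
    Ok((u32::from_be_bytes(document_id_bytes), word_bytes))
}

fn main() {
    assert!(split_docid_key(&[0, 0, 0, 7, b'w']).is_ok());
    assert!(split_docid_key(&[1, 2]).is_err()); // too short: typed error, no panic
}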