This repository was archived by the owner on Apr 4, 2023 and is now read-only.

Commit

Merge remote-tracking branch 'origin/main' into obkv-documents
MarinPostma committed Sep 3, 2021
2 parents 8537bf8 + 5cbe879 commit bc7b676
Showing 30 changed files with 301 additions and 99 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/benchmarks.yml
@@ -9,7 +9,7 @@ on:
default: 'search_songs'

env:
HOME: "/home/runner" # The actions-rs/toolchain@v1 can fail we have no $HOME defined
BENCH_NAME: ${{ github.event.inputs.dataset_name }}

jobs:
benchmarks:
@@ -38,14 +38,14 @@ jobs:
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
70 changes: 70 additions & 0 deletions .github/workflows/cron_benchmarks_indexing.yml
@@ -0,0 +1,70 @@
name: Benchmarks indexing (cron)

on:
schedule:
- cron: "30 0 * * FRI" # every friday at 00:30

env:
BENCH_NAME: "indexing"

jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results

# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
70 changes: 70 additions & 0 deletions .github/workflows/cron_benchmarks_search_songs.yml
@@ -0,0 +1,70 @@
name: Benchmarks search songs (cron)

on:
schedule:
- cron: "30 08 * * FRI" # every friday at 08:30

env:
BENCH_NAME: "search_songs"

jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results

# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
70 changes: 70 additions & 0 deletions .github/workflows/cron_benchmarks_search_wiki.yml
@@ -0,0 +1,70 @@
name: Benchmarks search wikipedia articles (cron)

on:
schedule:
- cron: "30 16 * * FRI" # every friday at 16:30 (it’s snacky snack-time!)

env:
BENCH_NAME: "search_wiki"

jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file

# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results

# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion helpers/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "helpers"
version = "0.11.0"
version = "0.12.0"
authors = ["Clément Renault <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion http-ui/Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "http-ui"
description = "The HTTP user interface of the milli search engine"
version = "0.11.0"
version = "0.12.0"
authors = ["Clément Renault <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion infos/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "infos"
version = "0.11.0"
version = "0.12.0"
authors = ["Clément Renault <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion milli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "milli"
version = "0.11.0"
version = "0.12.0"
authors = ["Kerollmops <[email protected]>"]
edition = "2018"

@@ -5,6 +5,7 @@ use std::{marker, str};
use crate::error::SerializationError;
use crate::heed_codec::RoaringBitmapCodec;
use crate::{try_split_array_at, try_split_at, Result};

pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>;

/// A codec that encodes a string in front of a value.
@@ -22,7 +23,6 @@ where

fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (string, bytes) = decode_prefix_string(bytes)?;

C::bytes_decode(bytes).map(|item| (string, item))
}
}
@@ -35,7 +35,6 @@

fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
let value_bytes = C::bytes_encode(value)?;

let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
encode_prefix_string(string, &mut bytes).ok()?;
bytes.extend_from_slice(&value_bytes[..]);
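The codec hunk above wraps an inner value codec behind a string prefix; the `Vec::with_capacity(2 + string.len() + value_bytes.len())` call hints at a two-byte length prefix. Below is a minimal round-trip sketch of such a layout, assuming a big-endian u16 length; the helper names echo the diff, but this is an illustration, not milli's actual implementation.

// Hypothetical layout: [u16 BE string length][string bytes][inner value bytes].
fn encode_prefix_string(s: &str, out: &mut Vec<u8>) -> Result<(), std::num::TryFromIntError> {
    let len = u16::try_from(s.len())?;
    out.extend_from_slice(&len.to_be_bytes());
    out.extend_from_slice(s.as_bytes());
    Ok(())
}

fn decode_prefix_string(bytes: &[u8]) -> Option<(&str, &[u8])> {
    if bytes.len() < 2 {
        return None;
    }
    let len = u16::from_be_bytes([bytes[0], bytes[1]]) as usize;
    let rest = &bytes[2..];
    if rest.len() < len {
        return None;
    }
    let string = std::str::from_utf8(&rest[..len]).ok()?;
    Some((string, &rest[len..]))
}

fn main() {
    let mut buf = Vec::new();
    encode_prefix_string("lyrics", &mut buf).unwrap();
    buf.extend_from_slice(&[0xDE_u8, 0xAD]); // stand-in for the inner codec's bytes
    assert_eq!(decode_prefix_string(&buf), Some(("lyrics", &[0xDE_u8, 0xAD][..])));
}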
2 changes: 1 addition & 1 deletion milli/src/search/mod.rs
@@ -145,8 +145,8 @@ impl<'a> Search<'a> {

// We check that we are allowed to use the sort criteria, we check
// that they are declared in the sortable fields.
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
if let Some(sort_criteria) = &self.sort_criteria {
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
for asc_desc in sort_criteria {
let field = asc_desc.field();
if !sortable_fields.contains(field) {
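The milli/src/search/mod.rs change only moves the `sortable_fields` lookup inside the `if let`, so the index is asked for its sortable fields only when sort criteria were actually supplied. A self-contained illustration of that defer-the-lookup pattern, with hypothetical names rather than milli's API:

fn check_sort_criteria(
    sort_criteria: Option<&[String]>,
    load_sortable_fields: impl FnOnce() -> Vec<String>, // e.g. a read against the index
) -> Result<(), String> {
    if let Some(criteria) = sort_criteria {
        // The potentially costly lookup now runs only on this branch.
        let sortable_fields = load_sortable_fields();
        for field in criteria {
            if !sortable_fields.contains(field) {
                return Err(format!("field `{field}` is not sortable"));
            }
        }
    }
    Ok(())
}

fn main() {
    // No sort criteria: the sortable-fields lookup is never executed.
    assert!(check_sort_criteria(None, || vec!["price".to_string()]).is_ok());
    // With criteria: the lookup runs and the field is validated.
    let criteria = ["price".to_string()];
    assert!(check_sort_criteria(Some(&criteria[..]), || vec!["price".to_string()]).is_ok());
}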
1 change: 1 addition & 0 deletions milli/src/update/facets.rs
@@ -57,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
self
}

#[logging_timer::time("Facets::{}")]
pub fn execute(self) -> Result<()> {
self.index.set_updated_at(self.wtxn, &Utc::now())?;
// We get the faceted fields to be able to create the facet levels.
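The attribute added to `Facets::execute` comes from the `logging_timer` crate: it wraps the function so its elapsed time is emitted through the `log` facade, and the "Facets::{}" argument looks like a label pattern filled in with the function name. A minimal sketch, assuming `logging_timer`, `log`, and `env_logger` are declared as dependencies:

// Sketch: timing a function with logging_timer's attribute macro used above.
#[logging_timer::time("Facets::{}")]
fn execute() {
    std::thread::sleep(std::time::Duration::from_millis(25));
}

fn main() {
    // Timer output goes through the `log` facade, so a logger must be initialized.
    env_logger::builder()
        .filter_level(log::LevelFilter::Debug)
        .init();
    execute();
}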
@@ -58,15 +58,16 @@ pub fn extract_fid_docid_facet_values<R: io::Read>(
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
let value_bytes = f64_into_bytes(number).unwrap(); // invalid float
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());

fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
}
}

// insert normalized and original facet string in sorter
for (normalized, original) in strings {
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized.as_bytes());
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
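The extractor change above swaps a panicking `.unwrap()` on `f64_into_bytes` for an `if let`, so facet numbers that cannot be encoded are skipped, and it filters out facet strings whose normalized form is empty before they reach the sorter. A rough sketch of both moves, using stand-in helpers rather than milli's actual encoding:

// Stand-in for milli's f64_into_bytes: refuse values it cannot encode instead of panicking.
fn f64_into_bytes(number: f64) -> Option<[u8; 8]> {
    if number.is_finite() {
        Some(number.to_be_bytes())
    } else {
        None
    }
}

fn collect_facet_values(
    numbers: &[f64],
    strings: Vec<(String, String)>, // (normalized, original)
) -> (Vec<Vec<u8>>, Vec<(String, String)>) {
    let mut number_keys = Vec::new();
    for &number in numbers {
        // Previously a failed conversion panicked; now the value is skipped.
        if let Some(value_bytes) = f64_into_bytes(number) {
            let mut key = value_bytes.to_vec();
            key.extend_from_slice(&number.to_be_bytes());
            number_keys.push(key);
        }
    }
    // Empty normalized strings are dropped before insertion.
    let strings = strings.into_iter().filter(|(n, _)| !n.is_empty()).collect();
    (number_keys, strings)
}

fn main() {
    let (keys, strings) = collect_facet_values(
        &[1.5, f64::NAN],
        vec![(String::new(), "  ".to_string()), ("blue".to_string(), "Blue".to_string())],
    );
    assert_eq!(keys.len(), 1); // NAN was skipped rather than causing a panic
    assert_eq!(strings, vec![("blue".to_string(), "Blue".to_string())]);
}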
@@ -8,6 +8,8 @@ use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::extract_position;
use crate::{DocumentId, FieldId, Result};

@@ -36,7 +38,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
let mut current_document_id = None;

while let Some((key, value)) = docid_word_positions.next()? {
let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap();
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

let curr_document_id = *current_document_id.get_or_insert(document_id);
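Here the `.unwrap()` on `try_split_array_at` becomes `.ok_or(SerializationError::Decoding { .. })?`, so a malformed key surfaces as a typed decoding error naming the database instead of a panic. A small sketch of the same Option-to-Result conversion, with a stand-in error type and database name in place of milli's:

#[derive(Debug)]
enum DecodeError {
    Decoding { db_name: Option<&'static str> },
}

// Stand-in for try_split_array_at: split off a fixed-size prefix if the slice is long enough.
fn try_split_array_at(key: &[u8]) -> Option<([u8; 4], &[u8])> {
    let head: [u8; 4] = key.get(..4)?.try_into().ok()?;
    Some((head, &key[4..]))
}

fn split_docid_key(key: &[u8]) -> Result<(u32, &[u8]), DecodeError> {
    // `ok_or` turns the Option into a Result carrying a descriptive error.
    let (document_id_bytes, word_bytes) = try_split_array_at(key)
        .ok_or(DecodeError::Decoding { db_name: Some("docid-word-positions") })?;
    Ok((u32::from_be_bytes(document_id_bytes), word_bytes))
}

fn main() {
    assert!(split_docid_key(&[0, 0, 0, 7, b'w']).is_ok());
    assert!(split_docid_key(&[1, 2]).is_err()); // too short: typed error, no panic
}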