From b7990f7a39d5b755e310632a23a1ac1ff855cf79 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Mon, 9 Oct 2017 15:56:02 -0500 Subject: [PATCH 01/18] Iterate over log entries --- Cargo.toml | 3 +- src/errors/mod.rs | 2 +- src/lib.rs | 9 +- src/serialization/interval_log.rs | 215 ++++++++++++++++++++++++ src/serialization/interval_log_tests.rs | 161 ++++++++++++++++++ src/serialization/mod.rs | 16 +- tests/data_access.rs | 1 - 7 files changed, 397 insertions(+), 10 deletions(-) create mode 100644 src/serialization/interval_log.rs create mode 100644 src/serialization/interval_log_tests.rs diff --git a/Cargo.toml b/Cargo.toml index 9d00247..d84e5cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,13 +24,14 @@ travis-ci = { repository = "jonhoo/hdrsample" } [features] bench_private = [] # for enabling nightly-only feature(test) on the main crate to allow benchmarking private code -serialization = [ "flate2" ] +serialization = [ "flate2", "nom" ] default = [ "serialization" ] [dependencies] num-traits = "0.1" byteorder = "1.0.0" flate2 = { version = "0.2.17", optional = true } +nom = { version = "^3.2.0", optional = true } [dev-dependencies] rand = "0.3.15" diff --git a/src/errors/mod.rs b/src/errors/mod.rs index 38555f4..04b0c5c 100644 --- a/src/errors/mod.rs +++ b/src/errors/mod.rs @@ -1,4 +1,4 @@ -//! Errors types used throughout this library +//! Error types used throughout this library /// Errors that can occur when creating a histogram. #[derive(Debug, Eq, PartialEq, Clone, Copy)] diff --git a/src/lib.rs b/src/lib.rs index f7d6570..b35b5c5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,7 +94,7 @@ //! use hdrsample::Histogram; //! let hist = Histogram::::new(2).unwrap(); //! // ... -//! println!("# of samples: {}", hist.count()); +//! println!("# of samples: {}", hist.len()); //! println!("99.9'th percentile: {}", hist.value_at_quantile(0.999)); //! ``` //! @@ -170,7 +170,6 @@ //! - `DoubleHistogram`. //! - The `Recorder` feature of HdrHistogram. //! - Value shifting ("normalization"). -//! - Timestamps and tags. //! - Textual output methods. These seem almost orthogonal to HdrSample, though it might be //! convenient if we implemented some relevant traits (CSV, JSON, and possibly simple //! `fmt::Display`). @@ -183,12 +182,16 @@ //! ergonomic as possible. #![deny(missing_docs, trivial_casts, trivial_numeric_casts, unused_extern_crates, - unused_import_braces, unused_results, variant_size_differences, warnings)] + unused_import_braces, unused_results)] // Enable feature(test) is enabled so that we can have benchmarks of private code #![cfg_attr(all(test, feature = "bench_private"), feature(test))] extern crate num_traits as num; +#[cfg(feature = "serialization")] +#[macro_use] +extern crate nom; + use std::borrow::Borrow; use std::cmp; use std::ops::{AddAssign, SubAssign}; diff --git a/src/serialization/interval_log.rs b/src/serialization/interval_log.rs new file mode 100644 index 0000000..d0500f6 --- /dev/null +++ b/src/serialization/interval_log.rs @@ -0,0 +1,215 @@ +use std::str; + +use nom::{double, line_ending, not_line_ending, IResult}; + +#[derive(PartialEq, Debug)] +pub struct IntervalLogHistogram<'a> { + tag: Option<&'a str>, + start_timestamp: f64, + duration: f64, + max_value: f64, + encoded_histogram: &'a str, +} + +impl<'a> IntervalLogHistogram<'a> { + /// Tag, if any is present. + pub fn tag(&self) -> Option<&'a str> { + self.tag + } + + /// Timestamp of the start of the interval in seconds. 
+ /// + /// The timestamp may be absolute vs the epoch, or there may be a StartTime or BaseTime for the + /// log. + pub fn start_timestamp(&self) -> f64 { + self.start_timestamp + } + + /// Duration of the interval in seconds. + pub fn duration(&self) -> f64 { + self.duration + } + + /// Max value in the encoded histogram + /// + /// This max value is the max of the histogram divided by some scaling factor (which may be + /// 1.0). + pub fn max_value(&self) -> f64 { + self.max_value + } + + /// Base64-encoded serialized histogram. + /// + /// If you need the deserialized histogram, use a `Deserializer. + /// + /// Histograms are left in their original encoding to make parsing each log entry very cheap. + /// One usage pattern is to navigate to a certain point in the log and only deserialize a few + /// interesting histograms, so it would be inefficient to deserialize them at log parse time.. + pub fn encoded_histogram(&self) -> &'a str { + self.encoded_histogram + } +} + +#[derive(PartialEq, Debug)] +/// Represents one non-comment line in an interval log. +/// +/// One thing to note is that the way your interval timestamps work can vary. If your log was +/// written with a StartTime or BaseTime, that metadata will appear in header comments, and that +/// will be represented by the iterator providing the corresponding variants here. The presence +/// of those timestamps will affect how you should interpret the timestamps for individual +/// intervals. +pub enum LogEntry<'a> { + /// Logs may include a StartTime. If present, it represents seconds since the epoch. + StartTime(f64), + /// Logs may include a BaseTime. If present, it represents seconds since the epoch. + BaseTime(f64), + /// An individual interval histogram. + Interval(IntervalLogHistogram<'a>), +} + +#[derive(Debug, PartialEq)] +pub enum LogIteratorError { + ParseError { offset: usize }, +} + +/// Parse interval logs. +/// +/// Interval logs, as handled by the Java implementation's `HistogramLogWriter`, +/// `HistogramLogReader`, and `HistogramLogProcessor`, are a way to record a sequence of histograms +/// over time. Suppose you were running a load test for an hour: you might want to record a +/// histogram per second or minute so that you could correlate measurements with behavior you might +/// see in logs, etc. +/// +/// An interval log contains some initial metadata, then a sequence of histograms, each with some +/// additional metadata (timestamps, etc). This iterator exposes each item (excluding comments and +/// other information-free lines). See `LogEntry`. +/// +/// This parses from a slice representing the complete file because it made implementation easier +/// (and also supports mmap'd files for maximum parsing speed). If parsing from a `Read` is +/// important for your use case, open an issue about it. +/// +/// # Examples +/// +/// ``` +/// use hdrsample::serialization; +/// // two newline-separated log lines: a comment, then an interval +/// let log = b"#I'm a comment\nTag=t,0.127,1.007,2.769,base64EncodedHisto\n"; +/// +/// let mut iter = serialization::IntervalLogIterator::new(&log[..]); +/// +/// match iter.next().unwrap().unwrap() { +/// serialization::LogEntry::Interval(h) => { +/// assert_eq!(0.127, h.start_timestamp()); +/// } +/// _ => panic!() +/// } +/// +/// assert_eq!(None, iter.next()); +/// ``` +pub struct IntervalLogIterator<'a> { + orig_len: usize, + input: &'a [u8], +} + +impl<'a> IntervalLogIterator<'a> { + /// Create a new iterator from the bytes of an interval log. 
+ pub fn new(input: &'a [u8]) -> IntervalLogIterator<'a> { + IntervalLogIterator { + orig_len: input.len(), + input, + } + } +} + +impl<'a> Iterator for IntervalLogIterator<'a> { + type Item = Result, LogIteratorError>; + + fn next(&mut self) -> Option { + // Look for magic comments first otherwise they will get matched by the simple comment + // parser + loop { + if self.input.is_empty() { + return None; + } + + if let IResult::Done(rest, e) = log_entry(self.input) { + self.input = rest; + return Some(Ok(e)); + } + + // it wasn't a log entry; try parsing a comment + + let ignored_line_result = ignored_line(self.input); + match ignored_line_result { + IResult::Done(rest, _) => { + self.input = rest; + continue; + } + _ => { + return Some(Err(LogIteratorError::ParseError { + offset: self.orig_len - self.input.len(), + })); + } + } + } + } +} + +named!(start_time<&[u8], LogEntry>, + do_parse!( + tag!("#[StartTime: ") >> + n: double >> + char!(' ') >> + not_line_ending >> + line_ending >> + (LogEntry::StartTime(n)) +)); + +named!(base_time<&[u8], LogEntry>, + do_parse!( + tag!("#[BaseTime: ") >> + n: double >> + char!(' ') >> + not_line_ending >> + line_ending >> + (LogEntry::BaseTime(n)) +)); + +named!(interval_hist<&[u8], LogEntry>, + do_parse!( + tag: opt!( + map_res!(map!(pair!(tag!("Tag="), take_until_and_consume!(",")), |p| p.1), + str::from_utf8)) >> + start_timestamp: double >> + char!(',') >> + duration: double >> + char!(',') >> + max_value: double >> + char!(',') >> + encoded_histogram: map_res!(not_line_ending, str::from_utf8) >> + line_ending >> + (LogEntry::Interval(IntervalLogHistogram { + tag, + start_timestamp, + duration, + max_value, + encoded_histogram + })) + ) +); + +named!(log_entry<&[u8], LogEntry>, alt_complete!(start_time | base_time | interval_hist)); + +named!(comment_line<&[u8], ()>, + do_parse!(tag!("#") >> not_line_ending >> line_ending >> (())) +); + +named!(legend<&[u8], ()>, + do_parse!(tag!("\"StartTimestamp\"") >> not_line_ending >> line_ending >> (())) +); + +named!(ignored_line<&[u8], ()>, alt!(comment_line | legend)); + +#[path = "interval_log_tests.rs"] +#[cfg(test)] +mod interval_log_tests; diff --git a/src/serialization/interval_log_tests.rs b/src/serialization/interval_log_tests.rs new file mode 100644 index 0000000..736b013 --- /dev/null +++ b/src/serialization/interval_log_tests.rs @@ -0,0 +1,161 @@ +use super::*; + +#[test] +fn parse_start_time() { + let (rest, e) = start_time( + b"#[StartTime: 1441812279.474 (seconds since epoch), Wed Sep 09 08:24:39 PDT 2015]\nfoo", + ).unwrap(); + + let expected = LogEntry::StartTime(1441812279.474); + + assert_eq!(expected, e); + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_base_time() { + let (rest, e) = base_time( + b"#[BaseTime: 1441812279.474 (seconds since epoch), Wed Sep 09 08:24:39 PDT 2015]\nfoo", + ).unwrap(); + + let expected = LogEntry::BaseTime(1441812279.474); + + assert_eq!(expected, e); + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_legend() { + let input = b"\"StartTimestamp\",\"Interval_Length\",\"Interval_Max\",\ + \"Interval_Compressed_Histogram\"\nfoo"; + let (rest, _) = legend(input).unwrap(); + + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_comment() { + let (rest, _) = comment_line(b"#SomeOtherComment\nfoo").unwrap(); + + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_interval_hist_no_tag() { + let (rest, e) = interval_hist(b"0.127,1.007,2.769,couldBeBase64\nfoo").unwrap(); + + let expected = LogEntry::Interval(IntervalLogHistogram { + tag: None, + start_timestamp: 
0.127, + duration: 1.007, + max_value: 2.769, + encoded_histogram: "couldBeBase64", + }); + + assert_eq!(expected, e); + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_interval_hist_with_tag() { + let (rest, e) = interval_hist(b"Tag=t,0.127,1.007,2.769,couldBeBase64\nfoo").unwrap(); + + let expected = LogEntry::Interval(IntervalLogHistogram { + tag: Some("t"), + start_timestamp: 0.127, + duration: 1.007, + max_value: 2.769, + encoded_histogram: "couldBeBase64", + }); + + assert_eq!(expected, e); + assert_eq!(b"foo", rest); +} + +#[test] +fn iter_with_ignored_prefix() { + let mut data = Vec::new(); + data.extend_from_slice(b"#I'm a comment\n"); + data.extend_from_slice(b"\"StartTimestamp\",etc\n"); + data.extend_from_slice(b"Tag=t,0.127,1.007,2.769,couldBeBase64\n"); + data.extend_from_slice(b"#[StartTime: 1441812279.474 ...\n"); + + let entries: Vec = IntervalLogIterator::new(&data) + .map(|r| r.unwrap()) + .collect(); + + let expected0 = LogEntry::Interval(IntervalLogHistogram { + tag: Some("t"), + start_timestamp: 0.127, + duration: 1.007, + max_value: 2.769, + encoded_histogram: "couldBeBase64", + }); + + let expected1 = LogEntry::StartTime(1441812279.474); + + assert_eq!(vec![expected0, expected1], entries) +} + +#[test] +fn iter_without_ignored_prefix() { + let mut data = Vec::new(); + data.extend_from_slice(b"Tag=t,0.127,1.007,2.769,couldBeBase64\n"); + data.extend_from_slice(b"#[StartTime: 1441812279.474 ...\n"); + + let entries: Vec = IntervalLogIterator::new(&data) + .map(|r| r.unwrap()) + .collect(); + + let expected0 = LogEntry::Interval(IntervalLogHistogram { + tag: Some("t"), + start_timestamp: 0.127, + duration: 1.007, + max_value: 2.769, + encoded_histogram: "couldBeBase64", + }); + + let expected1 = LogEntry::StartTime(1441812279.474); + + assert_eq!(vec![expected0, expected1], entries) +} + +#[test] +fn iter_multiple_entrties_with_interleaved_ignored() { + let mut data = Vec::new(); + data.extend_from_slice(b"#I'm a comment\n"); + data.extend_from_slice(b"\"StartTimestamp\",etc\n"); + data.extend_from_slice(b"Tag=t,0.127,1.007,2.769,couldBeBase64\n"); + data.extend_from_slice(b"#Another comment\n"); + data.extend_from_slice(b"#[StartTime: 1441812279.474 ...\n"); + data.extend_from_slice(b"#Yet another comment\n"); + data.extend_from_slice(b"#[BaseTime: 1441812279.474 ...\n"); + data.extend_from_slice(b"#Enough with the comments\n"); + + let entries: Vec = IntervalLogIterator::new(&data) + .map(|r| r.unwrap()) + .collect(); + + let expected0 = LogEntry::Interval(IntervalLogHistogram { + tag: Some("t"), + start_timestamp: 0.127, + duration: 1.007, + max_value: 2.769, + encoded_histogram: "couldBeBase64", + }); + + let expected1 = LogEntry::StartTime(1441812279.474); + let expected2 = LogEntry::BaseTime(1441812279.474); + + assert_eq!(vec![expected0, expected1, expected2], entries) +} + +#[test] +fn iter_all_ignored_empty_iter() { + let mut data = Vec::new(); + data.extend_from_slice(b"#I'm a comment\n"); + data.extend_from_slice(b"\"StartTimestamp\",etc\n"); + data.extend_from_slice(b"#Another comment\n"); + + assert_eq!(0, IntervalLogIterator::new(&data).count()); +} diff --git a/src/serialization/mod.rs b/src/serialization/mod.rs index 8f9bfe5..574c942 100644 --- a/src/serialization/mod.rs +++ b/src/serialization/mod.rs @@ -1,4 +1,4 @@ -//! # Serialization/deserialization +//! Serialization/deserialization support. //! //! The upstream Java project has established several different types of serialization. We have //! 
currently implemented V2 and V2 + DEFLATE (following the names used by the Java implementation). @@ -49,6 +49,10 @@ //! `Read`. This should make it easy to use them in almost any context, as everything from i/o //! streams to `Vec` can be a `Read` or `Write`. //! +//! # Interval logs +//! +//! Parsing interval logs is supported via `IntervalLogIterator`. +//! //! ### Integration with general-purpose serialization libraries //! //! In general, serializing histograms should be straightforward: pick the serialization format @@ -70,7 +74,7 @@ //! //! ``` //! use hdrsample::Histogram; -//! use hdrsample::serialization::{Deserializer, V2Serializer}; +//! use hdrsample::serialization::V2Serializer; //! //! // part of serde, simplified //! trait Serializer { @@ -85,6 +89,7 @@ //! } //! //! // your custom wrapper +//! #[allow(dead_code)] // to muffle warnings compiling this example //! struct V2HistogramWrapper { //! histogram: Histogram //! } @@ -117,7 +122,7 @@ //! let orig_histogram = Histogram::::new(1).unwrap(); //! V2Serializer::new().serialize(&orig_histogram, &mut vec).unwrap(); //! -//! let histogram: Histogram = Deserializer::new() +//! let _histogram: Histogram = Deserializer::new() //! .deserialize(&mut vec.as_slice()).unwrap(); //! ``` //! @@ -137,7 +142,7 @@ //! // Make some histograms //! for _ in 0..num_histograms { //! let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); -//! h.record_n(42, 7); +//! h.record_n(42, 7).unwrap(); //! histograms.push(h); //! } //! @@ -189,6 +194,9 @@ pub use self::v2_deflate_serializer::{V2DeflateSerializeError, V2DeflateSerializ mod deserializer; pub use self::deserializer::{DeserializeError, Deserializer}; +mod interval_log; +pub use self::interval_log::{IntervalLogIterator, LogEntry}; + const V2_COOKIE_BASE: u32 = 0x1c84_9303; const V2_COMPRESSED_COOKIE_BASE: u32 = 0x1c84_9304; diff --git a/tests/data_access.rs b/tests/data_access.rs index 6523726..3ea9e19 100644 --- a/tests/data_access.rs +++ b/tests/data_access.rs @@ -517,7 +517,6 @@ fn linear_iter_steps() { histogram += 1; histogram += 64; histogram += 128; - println!("{:?}", histogram.iter_linear(64).collect::>()); assert_eq!(histogram.iter_linear(64).count(), 4); } From 4d20592e8f32e2805ad782af0ba7e10662acd1ac Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Thu, 26 Oct 2017 09:42:21 -0500 Subject: [PATCH 02/18] Introduce a Serializer trait for serializers, and a Tag wrapper. Start work on writing logs. 
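
For illustration, a minimal sketch of the sort of generic code the new
Serializer trait is meant to enable. It assumes only the trait and the two
serializers touched by this patch; the `store` helper itself is hypothetical
and not part of the change:

    use std::io;

    use hdrsample::Histogram;
    use hdrsample::serialization::{Serializer, V2DeflateSerializer, V2Serializer};

    // Hypothetical helper: any serializer implementing the trait can write to
    // any io::Write sink, returning the number of bytes written.
    fn store<S: Serializer, W: io::Write>(
        s: &mut S,
        h: &Histogram<u64>,
        to: &mut W,
    ) -> Result<usize, S::SerializeError> {
        s.serialize(h, to)
    }

    let h = Histogram::<u64>::new(3).unwrap();
    let mut buf = Vec::new();
    // Plain V2 and V2 + DEFLATE now share the same call site.
    let _ = store(&mut V2Serializer::new(), &h, &mut buf).unwrap();
    let _ = store(&mut V2DeflateSerializer::new(), &h, &mut buf).unwrap();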
--- Cargo.toml | 3 +- benches/serialization.rs | 9 +- examples/cli.rs | 5 +- src/serialization/interval_log.rs | 215 --------- src/serialization/interval_log/mod.rs | 414 ++++++++++++++++++ .../tests.rs} | 82 +++- src/serialization/mod.rs | 57 ++- src/serialization/test_serialize_trait.rs | 26 -- src/serialization/tests.rs | 10 +- src/serialization/v2_deflate_serializer.rs | 12 +- src/serialization/v2_serializer.rs | 12 +- tests/histogram.rs | 1 + tests/serialization.rs | 2 +- 13 files changed, 558 insertions(+), 290 deletions(-) delete mode 100644 src/serialization/interval_log.rs create mode 100644 src/serialization/interval_log/mod.rs rename src/serialization/{interval_log_tests.rs => interval_log/tests.rs} (67%) delete mode 100644 src/serialization/test_serialize_trait.rs diff --git a/Cargo.toml b/Cargo.toml index d84e5cb..0a3ca71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ travis-ci = { repository = "jonhoo/hdrsample" } [features] bench_private = [] # for enabling nightly-only feature(test) on the main crate to allow benchmarking private code -serialization = [ "flate2", "nom" ] +serialization = [ "flate2", "nom", "base64" ] default = [ "serialization" ] [dependencies] @@ -32,6 +32,7 @@ num-traits = "0.1" byteorder = "1.0.0" flate2 = { version = "0.2.17", optional = true } nom = { version = "^3.2.0", optional = true } +base64 = { version = "0.7.0", optional = true } [dev-dependencies] rand = "0.3.15" diff --git a/benches/serialization.rs b/benches/serialization.rs index 15a356c..7763638 100644 --- a/benches/serialization.rs +++ b/benches/serialization.rs @@ -9,8 +9,7 @@ use hdrsample::serialization::*; use self::rand::distributions::range::Range; use self::rand::distributions::IndependentSample; use self::test::Bencher; -use std::io::{Cursor, Write}; -use std::fmt::Debug; +use std::io::Cursor; #[bench] fn serialize_tiny_dense_v2(b: &mut Bencher) { @@ -168,7 +167,7 @@ fn do_serialize_bench( digits: u8, fraction_of_counts_len: f64, ) where - S: TestOnlyHypotheticalSerializerInterface, + S: Serializer, { let mut h = Histogram::::new_with_bounds(low, high, digits).unwrap(); let random_counts = (fraction_of_counts_len * h.distinct_values() as f64) as usize; @@ -196,7 +195,7 @@ fn do_deserialize_bench( digits: u8, fraction_of_counts_len: f64, ) where - S: TestOnlyHypotheticalSerializerInterface, + S: Serializer, { let mut h = Histogram::::new_with_bounds(low, high, digits).unwrap(); let random_counts = (fraction_of_counts_len * h.distinct_values() as f64) as usize; @@ -217,5 +216,3 @@ fn do_deserialize_bench( let _: Histogram = d.deserialize(&mut cursor).unwrap(); }); } - -include!("../src/serialization/test_serialize_trait.rs"); diff --git a/examples/cli.rs b/examples/cli.rs index ca129ce..d92bc43 100644 --- a/examples/cli.rs +++ b/examples/cli.rs @@ -9,8 +9,9 @@ use std::fmt::Display; use clap::{App, Arg, SubCommand}; use hdrsample::{Histogram, RecordError}; -use hdrsample::serialization::{DeserializeError, Deserializer, V2DeflateSerializeError, - V2DeflateSerializer, V2SerializeError, V2Serializer}; +use hdrsample::serialization::{DeserializeError, Deserializer, Serializer, + V2DeflateSerializeError, V2DeflateSerializer, V2SerializeError, + V2Serializer}; fn main() { let default_max = format!("{}", u64::max_value()); diff --git a/src/serialization/interval_log.rs b/src/serialization/interval_log.rs deleted file mode 100644 index d0500f6..0000000 --- a/src/serialization/interval_log.rs +++ /dev/null @@ -1,215 +0,0 @@ -use std::str; - -use nom::{double, line_ending, 
not_line_ending, IResult}; - -#[derive(PartialEq, Debug)] -pub struct IntervalLogHistogram<'a> { - tag: Option<&'a str>, - start_timestamp: f64, - duration: f64, - max_value: f64, - encoded_histogram: &'a str, -} - -impl<'a> IntervalLogHistogram<'a> { - /// Tag, if any is present. - pub fn tag(&self) -> Option<&'a str> { - self.tag - } - - /// Timestamp of the start of the interval in seconds. - /// - /// The timestamp may be absolute vs the epoch, or there may be a StartTime or BaseTime for the - /// log. - pub fn start_timestamp(&self) -> f64 { - self.start_timestamp - } - - /// Duration of the interval in seconds. - pub fn duration(&self) -> f64 { - self.duration - } - - /// Max value in the encoded histogram - /// - /// This max value is the max of the histogram divided by some scaling factor (which may be - /// 1.0). - pub fn max_value(&self) -> f64 { - self.max_value - } - - /// Base64-encoded serialized histogram. - /// - /// If you need the deserialized histogram, use a `Deserializer. - /// - /// Histograms are left in their original encoding to make parsing each log entry very cheap. - /// One usage pattern is to navigate to a certain point in the log and only deserialize a few - /// interesting histograms, so it would be inefficient to deserialize them at log parse time.. - pub fn encoded_histogram(&self) -> &'a str { - self.encoded_histogram - } -} - -#[derive(PartialEq, Debug)] -/// Represents one non-comment line in an interval log. -/// -/// One thing to note is that the way your interval timestamps work can vary. If your log was -/// written with a StartTime or BaseTime, that metadata will appear in header comments, and that -/// will be represented by the iterator providing the corresponding variants here. The presence -/// of those timestamps will affect how you should interpret the timestamps for individual -/// intervals. -pub enum LogEntry<'a> { - /// Logs may include a StartTime. If present, it represents seconds since the epoch. - StartTime(f64), - /// Logs may include a BaseTime. If present, it represents seconds since the epoch. - BaseTime(f64), - /// An individual interval histogram. - Interval(IntervalLogHistogram<'a>), -} - -#[derive(Debug, PartialEq)] -pub enum LogIteratorError { - ParseError { offset: usize }, -} - -/// Parse interval logs. -/// -/// Interval logs, as handled by the Java implementation's `HistogramLogWriter`, -/// `HistogramLogReader`, and `HistogramLogProcessor`, are a way to record a sequence of histograms -/// over time. Suppose you were running a load test for an hour: you might want to record a -/// histogram per second or minute so that you could correlate measurements with behavior you might -/// see in logs, etc. -/// -/// An interval log contains some initial metadata, then a sequence of histograms, each with some -/// additional metadata (timestamps, etc). This iterator exposes each item (excluding comments and -/// other information-free lines). See `LogEntry`. -/// -/// This parses from a slice representing the complete file because it made implementation easier -/// (and also supports mmap'd files for maximum parsing speed). If parsing from a `Read` is -/// important for your use case, open an issue about it. 
-/// -/// # Examples -/// -/// ``` -/// use hdrsample::serialization; -/// // two newline-separated log lines: a comment, then an interval -/// let log = b"#I'm a comment\nTag=t,0.127,1.007,2.769,base64EncodedHisto\n"; -/// -/// let mut iter = serialization::IntervalLogIterator::new(&log[..]); -/// -/// match iter.next().unwrap().unwrap() { -/// serialization::LogEntry::Interval(h) => { -/// assert_eq!(0.127, h.start_timestamp()); -/// } -/// _ => panic!() -/// } -/// -/// assert_eq!(None, iter.next()); -/// ``` -pub struct IntervalLogIterator<'a> { - orig_len: usize, - input: &'a [u8], -} - -impl<'a> IntervalLogIterator<'a> { - /// Create a new iterator from the bytes of an interval log. - pub fn new(input: &'a [u8]) -> IntervalLogIterator<'a> { - IntervalLogIterator { - orig_len: input.len(), - input, - } - } -} - -impl<'a> Iterator for IntervalLogIterator<'a> { - type Item = Result, LogIteratorError>; - - fn next(&mut self) -> Option { - // Look for magic comments first otherwise they will get matched by the simple comment - // parser - loop { - if self.input.is_empty() { - return None; - } - - if let IResult::Done(rest, e) = log_entry(self.input) { - self.input = rest; - return Some(Ok(e)); - } - - // it wasn't a log entry; try parsing a comment - - let ignored_line_result = ignored_line(self.input); - match ignored_line_result { - IResult::Done(rest, _) => { - self.input = rest; - continue; - } - _ => { - return Some(Err(LogIteratorError::ParseError { - offset: self.orig_len - self.input.len(), - })); - } - } - } - } -} - -named!(start_time<&[u8], LogEntry>, - do_parse!( - tag!("#[StartTime: ") >> - n: double >> - char!(' ') >> - not_line_ending >> - line_ending >> - (LogEntry::StartTime(n)) -)); - -named!(base_time<&[u8], LogEntry>, - do_parse!( - tag!("#[BaseTime: ") >> - n: double >> - char!(' ') >> - not_line_ending >> - line_ending >> - (LogEntry::BaseTime(n)) -)); - -named!(interval_hist<&[u8], LogEntry>, - do_parse!( - tag: opt!( - map_res!(map!(pair!(tag!("Tag="), take_until_and_consume!(",")), |p| p.1), - str::from_utf8)) >> - start_timestamp: double >> - char!(',') >> - duration: double >> - char!(',') >> - max_value: double >> - char!(',') >> - encoded_histogram: map_res!(not_line_ending, str::from_utf8) >> - line_ending >> - (LogEntry::Interval(IntervalLogHistogram { - tag, - start_timestamp, - duration, - max_value, - encoded_histogram - })) - ) -); - -named!(log_entry<&[u8], LogEntry>, alt_complete!(start_time | base_time | interval_hist)); - -named!(comment_line<&[u8], ()>, - do_parse!(tag!("#") >> not_line_ending >> line_ending >> (())) -); - -named!(legend<&[u8], ()>, - do_parse!(tag!("\"StartTimestamp\"") >> not_line_ending >> line_ending >> (())) -); - -named!(ignored_line<&[u8], ()>, alt!(comment_line | legend)); - -#[path = "interval_log_tests.rs"] -#[cfg(test)] -mod interval_log_tests; diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs new file mode 100644 index 0000000..e90b512 --- /dev/null +++ b/src/serialization/interval_log/mod.rs @@ -0,0 +1,414 @@ +//! Interval log parsing and writing. +//! +//! Interval logs, as handled by the Java implementation's `HistogramLogWriter`, +//! `HistogramLogReader`, and `HistogramLogProcessor`, are a way to record a sequence of histograms +//! over time. Suppose you were running a load test for an hour: you might want to record a +//! histogram per second or minute so that you could correlate measurements with behavior you might +//! see in logs, etc. +//! +//! 
An interval log contains some initial metadata, then a sequence of histograms, each with some +//! additional metadata (timestamps, etc). +//! +//! To parse a log, see `IntervalLogIterator`. To write a log, see `LogHeaderWriter`. + +extern crate base64; + +use std::{io, str}; +use std::fmt::Write; + +use nom::{double, line_ending, not_line_ending, IResult}; + +use super::super::{Counter, Histogram}; +use super::Serializer; + +/// The starting point for writing an interval log. +/// +/// This type only allows writing comments and headers. Once you're done writing those things, use +/// `into_log_writer()` to convert this into an `IntervalLogWriter`. +pub struct IntervalLogHeaderWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { + internal_writer: InternalLogWriter<'a, 'b, W, S>, +} + +impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogHeaderWriter<'a, 'b, W, S> { + /// Create a new log writer that writes to `writer` and serializes histograms with `serializer`. + pub fn new(writer: &'a mut W, serializer: &'b mut S) -> IntervalLogHeaderWriter<'a, 'b, W, S> { + IntervalLogHeaderWriter { + internal_writer: InternalLogWriter { + writer, + serializer, + text_buf: String::new(), + serialize_buf: Vec::new(), + }, + } + } + + /// Add a comment line. + pub fn write_comment(&mut self, s: &str) -> io::Result<()> { + self.internal_writer.write_comment(s) + } + + /// Once you're finished with headers, convert this into a log writer so you can write interval + /// histograms. + pub fn into_log_writer(self) -> IntervalLogWriter<'a, 'b, W, S> { + IntervalLogWriter { + internal_writer: self.internal_writer, + } + } +} + +/// Writes interval histograms in an interval log. +/// +/// +pub struct IntervalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { + internal_writer: InternalLogWriter<'a, 'b, W, S>, +} + +impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, S> { + /// Add a comment line. + pub fn write_comment(&mut self, s: &str) -> io::Result<()> { + self.internal_writer.write_comment(s) + } + + /// Write an interval histogram. + /// + /// `start_timestamp` is the time since the epoch in seconds. If you're using a StartTime or + /// BaseTime offset, you should instead use a delta since that time. + /// `duration` is the duration of the interval in seconds. + /// `tag` is an optional tag for this histogram. + pub fn write_histogram( + &mut self, + h: &Histogram, + start_timestamp: f64, + duration: f64, + tag: Option, + ) -> Result<(), IntervalLogWriterError> { + self.internal_writer + .write_histogram(h, start_timestamp, duration, tag) + } +} + +/// Errors that can occur while writing a log. +#[derive(Debug)] +pub enum IntervalLogWriterError { + /// Histogram serialization failed. + SerializeError(E), + /// An i/o error occurred. + IoError(io::ErrorKind), +} + +impl From for IntervalLogWriterError { + fn from(e: io::Error) -> Self { + IntervalLogWriterError::IoError(e.kind()) + } +} + +/// Write interval logs. 
+struct InternalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { + writer: &'a mut W, + serializer: &'b mut S, + text_buf: String, + serialize_buf: Vec, +} + +impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, S> { + fn write_comment(&mut self, s: &str) -> io::Result<()> { + write!(self.writer, "#{}\n", s) + } + + fn write_histogram( + &mut self, + h: &Histogram, + start_timestamp: f64, + duration: f64, + tag: Option, + ) -> Result<(), IntervalLogWriterError> { + self.serialize_buf.clear(); + self.text_buf.clear(); + + if let Some(Tag(s)) = tag { + write!(self.text_buf, "Tag={},", &s).expect("Writes to a String can't fail"); + } + + write!( + self.writer, + "{}{},{},{},", + self.text_buf, + start_timestamp, + duration, + h.max() + )?; + + self.text_buf.clear(); + let _len = self.serializer + .serialize(h, &mut self.serialize_buf) + .map_err(|e| IntervalLogWriterError::SerializeError(e))?; + base64::encode_config_buf(&self.serialize_buf, base64::STANDARD, &mut self.text_buf); + + self.writer.write_all(self.text_buf.as_bytes())?; + self.writer.write_all(b"\n")?; + + Ok(()) + } +} + +/// A tag for an interval histogram. +/// +/// Tags are just strings that do not contain a few disallowed characters: ',', '\r', '\n', and ' '. +#[derive(Debug, PartialEq, Clone, Copy)] +pub struct Tag<'a>(&'a str); + +impl<'a> Tag<'a> { + /// Create a new Tag. + /// + /// If a disallowed character is present, this will return `None`. + pub fn new(s: &'a str) -> Option> { + if s.chars() + .any(|c| c == ',' || c == '\r' || c == '\n' || c == ' ') + { + None + } else { + Some(Tag(s)) + } + } +} + +/// An individual interval histogram parsed from an interval log. +#[derive(PartialEq, Debug)] +pub struct IntervalLogHistogram<'a> { + tag: Option>, + start_timestamp: f64, + duration: f64, + max_value: f64, + encoded_histogram: &'a str, +} + +impl<'a> IntervalLogHistogram<'a> { + /// Tag, if any is present. + pub fn tag(&self) -> Option> { + self.tag + } + + /// Timestamp of the start of the interval in seconds. + /// + /// The timestamp may be absolute vs the epoch, or there may be a StartTime or BaseTime for the + /// log. + pub fn start_timestamp(&self) -> f64 { + self.start_timestamp + } + + /// Duration of the interval in seconds. + pub fn duration(&self) -> f64 { + self.duration + } + + /// Max value in the encoded histogram + /// + /// This max value is the max of the histogram divided by some scaling factor (which may be + /// 1.0). + pub fn max_value(&self) -> f64 { + self.max_value + } + + /// Base64-encoded serialized histogram. + /// + /// If you need the deserialized histogram, use a `Deserializer. + /// + /// Histograms are left in their original encoding to make parsing each log entry very cheap. + /// One usage pattern is to navigate to a certain point in the log and only deserialize a few + /// interesting histograms, so it would be inefficient to deserialize them at log parse time. + pub fn encoded_histogram(&self) -> &'a str { + self.encoded_histogram + } +} + +#[derive(PartialEq, Debug)] +/// Represents one non-comment line in an interval log. +/// +/// One thing to note is that the way your interval timestamps work can vary. If your log was +/// written with a StartTime or BaseTime, that metadata will appear in header comments, and that +/// will be represented by the iterator providing the corresponding variants here. The presence +/// of those timestamps will affect how you should interpret the timestamps for individual +/// intervals. 
+pub enum LogEntry<'a> { + /// Logs may include a StartTime. If present, it represents seconds since the epoch. + StartTime(f64), + /// Logs may include a BaseTime. If present, it represents seconds since the epoch. + BaseTime(f64), + /// An individual interval histogram. + Interval(IntervalLogHistogram<'a>), +} + +/// Errors that occur when parsing an interval log. +#[derive(Debug, PartialEq)] +pub enum LogIteratorError { + /// Parsing failed + ParseError { + /// Offset in the input where the failed parse started + offset: usize, + }, +} + +/// Parse interval logs. +/// +/// This iterator exposes each item (excluding comments and other information-free lines). See +/// `LogEntry`. +/// +/// This parses from a slice representing the complete file because it made implementation easier +/// (and also supports mmap'd files for maximum parsing speed). If parsing from a `Read` is +/// important for your use case, open an issue about it. +/// +/// # Examples +/// +/// Parse a single interval from a log. +/// +/// ``` +/// use hdrsample::serialization::interval_log; +/// // two newline-separated log lines: a comment, then an interval +/// let log = b"#I'm a comment\nTag=t,0.127,1.007,2.769,base64EncodedHisto\n"; +/// +/// let mut iter = interval_log::IntervalLogIterator::new(&log[..]); +/// +/// match iter.next().unwrap().unwrap() { +/// interval_log::LogEntry::Interval(h) => { +/// assert_eq!(0.127, h.start_timestamp()); +/// } +/// _ => panic!() +/// } +/// +/// assert_eq!(None, iter.next()); +/// ``` +/// +/// Skip logs that started before 3 seconds. +/// +/// ``` +/// use hdrsample::serialization::interval_log; +/// let mut log = Vec::new(); +/// log.extend_from_slice(b"#I'm a comment\n"); +/// log.extend_from_slice(b"Tag=a,0.123,1.007,2.769,base64EncodedHisto\n"); +/// log.extend_from_slice(b"1.456,1.007,2.769,base64EncodedHisto\n"); +/// log.extend_from_slice(b"3.789,1.007,2.769,base64EncodedHisto\n"); +/// log.extend_from_slice(b"Tag=b,4.123,1.007,2.769,base64EncodedHisto\n"); +/// log.extend_from_slice(b"5.456,1.007,2.769,base64EncodedHisto\n"); +/// log.extend_from_slice(b"#Another comment\n"); +/// +/// let iter = interval_log::IntervalLogIterator::new(&log); +/// +/// let count = iter.map(|r| r.unwrap()) +/// .filter_map(|e| match e { +/// interval_log::LogEntry::Interval(ilh) => Some(ilh), +/// _ => None +/// }) +/// .filter(|ilh| ilh.start_timestamp() >= 3.0) +/// .count(); +/// +/// assert_eq!(3, count); +/// ``` +pub struct IntervalLogIterator<'a> { + orig_len: usize, + input: &'a [u8], +} + +impl<'a> IntervalLogIterator<'a> { + /// Create a new iterator from the UTF-8 bytes of an interval log. 
+ pub fn new(input: &'a [u8]) -> IntervalLogIterator<'a> { + IntervalLogIterator { + orig_len: input.len(), + input, + } + } +} + +impl<'a> Iterator for IntervalLogIterator<'a> { + type Item = Result, LogIteratorError>; + + fn next(&mut self) -> Option { + // Look for magic comments first otherwise they will get matched by the simple comment + // parser + loop { + if self.input.is_empty() { + return None; + } + + if let IResult::Done(rest, e) = log_entry(self.input) { + self.input = rest; + return Some(Ok(e)); + } + + // it wasn't a log entry; try parsing a comment + + let ignored_line_result = ignored_line(self.input); + match ignored_line_result { + IResult::Done(rest, _) => { + self.input = rest; + continue; + } + _ => { + return Some(Err(LogIteratorError::ParseError { + offset: self.orig_len - self.input.len(), + })); + } + } + } + } +} + +named!(start_time<&[u8], LogEntry>, + do_parse!( + tag!("#[StartTime: ") >> + n: double >> + char!(' ') >> + not_line_ending >> + line_ending >> + (LogEntry::StartTime(n)) +)); + +named!(base_time<&[u8], LogEntry>, + do_parse!( + tag!("#[BaseTime: ") >> + n: double >> + char!(' ') >> + not_line_ending >> + line_ending >> + (LogEntry::BaseTime(n)) +)); + +named!(interval_hist<&[u8], LogEntry>, + do_parse!( + tag: opt!( + map!( + map_res!( + map!(pair!(tag!("Tag="), take_until_and_consume!(",")), |p| p.1), + str::from_utf8), + |s| Tag(s))) >> + start_timestamp: double >> + char!(',') >> + duration: double >> + char!(',') >> + max_value: double >> + char!(',') >> + encoded_histogram: map_res!(not_line_ending, str::from_utf8) >> + line_ending >> + (LogEntry::Interval(IntervalLogHistogram { + tag, + start_timestamp, + duration, + max_value, + encoded_histogram + })) + ) +); + +named!(log_entry<&[u8], LogEntry>, alt_complete!(start_time | base_time | interval_hist)); + +named!(comment_line<&[u8], ()>, + do_parse!(tag!("#") >> not_line_ending >> line_ending >> (())) +); + +named!(legend<&[u8], ()>, + do_parse!(tag!("\"StartTimestamp\"") >> not_line_ending >> line_ending >> (())) +); + +named!(ignored_line<&[u8], ()>, alt!(comment_line | legend)); + +#[cfg(test)] +mod tests; diff --git a/src/serialization/interval_log_tests.rs b/src/serialization/interval_log/tests.rs similarity index 67% rename from src/serialization/interval_log_tests.rs rename to src/serialization/interval_log/tests.rs index 736b013..7282d62 100644 --- a/src/serialization/interval_log_tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -1,5 +1,79 @@ +use super::super::super::*; +use super::super::*; use super::*; +#[test] +fn write_header_comment() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + { + let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); + + header_writer.write_comment("foo").unwrap(); + } + + assert_eq!(&b"#foo\n"[..], &buf[..]); +} + +#[test] +fn write_header_then_interval_comment() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + { + let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); + header_writer.write_comment("foo").unwrap(); + let mut log_writer = header_writer.into_log_writer(); + + log_writer.write_comment("bar").unwrap(); + } + + assert_eq!("#foo\n#bar\n", str::from_utf8(&buf[..]).unwrap()); +} + +#[test] +fn write_interval_histo_no_tag() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + let h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + + { + let header_writer = 
IntervalLogHeaderWriter::new(&mut buf, &mut serializer); + let mut log_writer = header_writer.into_log_writer(); + + log_writer.write_histogram(&h, 1.234, 5.678, None).unwrap(); + } + + assert_eq!( + "1.234,5.678,0,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", + str::from_utf8(&buf[..]).unwrap() + ); +} + +#[test] +fn write_interval_histo_with_tag() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + let h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + + { + let header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); + let mut log_writer = header_writer.into_log_writer(); + + log_writer + .write_histogram(&h, 1.234, 5.678, Tag::new("t")) + .unwrap(); + } + + assert_eq!( + "Tag=t,1.234,5.678,0,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", + str::from_utf8(&buf[..]).unwrap() + ); +} + #[test] fn parse_start_time() { let (rest, e) = start_time( @@ -61,7 +135,7 @@ fn parse_interval_hist_with_tag() { let (rest, e) = interval_hist(b"Tag=t,0.127,1.007,2.769,couldBeBase64\nfoo").unwrap(); let expected = LogEntry::Interval(IntervalLogHistogram { - tag: Some("t"), + tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, max_value: 2.769, @@ -85,7 +159,7 @@ fn iter_with_ignored_prefix() { .collect(); let expected0 = LogEntry::Interval(IntervalLogHistogram { - tag: Some("t"), + tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, max_value: 2.769, @@ -108,7 +182,7 @@ fn iter_without_ignored_prefix() { .collect(); let expected0 = LogEntry::Interval(IntervalLogHistogram { - tag: Some("t"), + tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, max_value: 2.769, @@ -137,7 +211,7 @@ fn iter_multiple_entrties_with_interleaved_ignored() { .collect(); let expected0 = LogEntry::Interval(IntervalLogHistogram { - tag: Some("t"), + tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, max_value: 2.769, diff --git a/src/serialization/mod.rs b/src/serialization/mod.rs index 574c942..30c405e 100644 --- a/src/serialization/mod.rs +++ b/src/serialization/mod.rs @@ -74,18 +74,20 @@ //! //! ``` //! use hdrsample::Histogram; -//! use hdrsample::serialization::V2Serializer; -//! -//! // part of serde, simplified -//! trait Serializer { -//! // ... -//! fn serialize_bytes(self, value: &[u8]) -> Result<(), ()>; -//! // ... -//! } +//! use hdrsample::serialization::{Serializer, V2Serializer}; +//! +//! mod serde { +//! // part of serde, simplified +//! pub trait Serializer { +//! // ... +//! fn serialize_bytes(self, value: &[u8]) -> Result<(), ()>; +//! // ... +//! } //! -//! // also in serde -//! trait Serialize { -//! fn serialize(&self, serializer: S) -> Result<(), ()>; +//! // also in serde +//! pub trait Serialize { +//! fn serialize(&self, serializer: S) -> Result<(), ()>; +//! } //! } //! //! // your custom wrapper @@ -94,8 +96,8 @@ //! histogram: Histogram //! } //! -//! impl Serialize for V2HistogramWrapper { -//! fn serialize(&self, serializer: S) -> Result<(), ()> { +//! impl serde::Serialize for V2HistogramWrapper { +//! fn serialize(&self, serializer: S) -> Result<(), ()> { //! // Not optimal to not re-use the vec and serializer, but it'll work //! let mut vec = Vec::new(); //! // Pick the serialization format you want to use. Here, we use plain V2, but V2 + @@ -116,7 +118,7 @@ //! //! ``` //! use hdrsample::Histogram; -//! use hdrsample::serialization::{Deserializer, V2Serializer}; +//! use hdrsample::serialization::{Deserializer, Serializer, V2Serializer}; //! //! 
let mut vec = Vec::new(); //! let orig_histogram = Histogram::::new(1).unwrap(); @@ -131,7 +133,7 @@ //! //! ``` //! use hdrsample::Histogram; -//! use hdrsample::serialization::{Deserializer, V2Serializer}; +//! use hdrsample::serialization::{Deserializer, Serializer, V2Serializer}; //! use std::io::Cursor; //! //! // Naturally, do real error handling instead of unwrap() everywhere @@ -179,6 +181,10 @@ extern crate byteorder; extern crate flate2; +use std::{fmt, io}; + +use super::{Counter, Histogram}; + #[cfg(test)] mod tests; @@ -194,8 +200,7 @@ pub use self::v2_deflate_serializer::{V2DeflateSerializeError, V2DeflateSerializ mod deserializer; pub use self::deserializer::{DeserializeError, Deserializer}; -mod interval_log; -pub use self::interval_log::{IntervalLogIterator, LogEntry}; +pub mod interval_log; const V2_COOKIE_BASE: u32 = 0x1c84_9303; const V2_COMPRESSED_COOKIE_BASE: u32 = 0x1c84_9304; @@ -204,3 +209,21 @@ const V2_COOKIE: u32 = V2_COOKIE_BASE | 0x10; const V2_COMPRESSED_COOKIE: u32 = V2_COMPRESSED_COOKIE_BASE | 0x10; const V2_HEADER_SIZE: usize = 40; + +/// Histogram serializer. +/// +/// Different implementations serialize to different formats. +pub trait Serializer { + /// Error type returned when serialization fails. + type SerializeError: fmt::Debug; + + /// Serialize the histogram into the provided writer. + /// Returns the number of bytes written, or an error. + /// + /// Note that `Vec` is a reasonable `Write` implementation for simple usage. + fn serialize( + &mut self, + h: &Histogram, + writer: &mut W, + ) -> Result; +} diff --git a/src/serialization/test_serialize_trait.rs b/src/serialization/test_serialize_trait.rs deleted file mode 100644 index 263fb25..0000000 --- a/src/serialization/test_serialize_trait.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Maybe someday there will be an obvious right answer for what serialization should look like, at -// least to the user, but for now we'll only take an easily reversible step towards that. There are -// still several ways the serializer interfaces could change to achieve better performance, so -// committing to anything right now would be premature. 
-trait TestOnlyHypotheticalSerializerInterface { - type SerializeError: Debug; - - fn serialize(&mut self, h: &Histogram, writer: &mut W) - -> Result; -} - -impl TestOnlyHypotheticalSerializerInterface for V2Serializer { - type SerializeError = V2SerializeError; - - fn serialize(&mut self, h: &Histogram, writer: &mut W) -> Result { - self.serialize(h, writer) - } -} - -impl TestOnlyHypotheticalSerializerInterface for V2DeflateSerializer { - type SerializeError = V2DeflateSerializeError; - - fn serialize(&mut self, h: &Histogram, writer: &mut W) -> Result { - self.serialize(h, writer) - } -} diff --git a/src/serialization/tests.rs b/src/serialization/tests.rs index 177cc6f..8298d27 100644 --- a/src/serialization/tests.rs +++ b/src/serialization/tests.rs @@ -1,7 +1,7 @@ extern crate rand; -use super::{V2DeflateSerializeError, V2DeflateSerializer, V2SerializeError, V2Serializer, - V2_COOKIE, V2_HEADER_SIZE}; +use super::{Serializer, V2DeflateSerializer, V2SerializeError, V2Serializer, V2_COOKIE, + V2_HEADER_SIZE}; use super::v2_serializer::{counts_array_max_encoded_size, encode_counts, varint_write, zig_zag_encode}; use super::deserializer::{varint_read, varint_read_slice, zig_zag_decode, Deserializer}; @@ -9,7 +9,7 @@ use super::byteorder::{BigEndian, ReadBytesExt}; use super::super::{Counter, Histogram}; use num::ToPrimitive; use super::super::tests::helpers::histo64; -use std::io::{Cursor, Write}; +use std::io::Cursor; use std::fmt::{Debug, Display}; use std::iter::once; use self::rand::{Rand, Rng}; @@ -553,7 +553,7 @@ fn do_varint_write_read_slice_roundtrip_rand(byte_length: usize) { fn do_serialize_roundtrip_random(mut serializer: S, max_count: T) where - S: TestOnlyHypotheticalSerializerInterface, + S: Serializer, T: Counter + Debug + Display + Rand + ToPrimitive + SampleRange, { let mut d = Deserializer::new(); @@ -764,5 +764,3 @@ impl Iterator for RandomVarintEncodedLengthIter { Some(value_range.ind_sample(&mut self.rng)) } } - -include!("test_serialize_trait.rs"); diff --git a/src/serialization/v2_deflate_serializer.rs b/src/serialization/v2_deflate_serializer.rs index 508de4f..beb5183 100644 --- a/src/serialization/v2_deflate_serializer.rs +++ b/src/serialization/v2_deflate_serializer.rs @@ -1,6 +1,6 @@ use super::super::Histogram; use core::counter::Counter; -use super::V2_COMPRESSED_COOKIE; +use super::{Serializer, V2_COMPRESSED_COOKIE}; use super::v2_serializer::{V2SerializeError, V2Serializer}; use super::byteorder::{BigEndian, WriteBytesExt}; use super::flate2::Compression; @@ -48,12 +48,12 @@ impl V2DeflateSerializer { v2_serializer: V2Serializer::new(), } } +} + +impl Serializer for V2DeflateSerializer { + type SerializeError = V2DeflateSerializeError; - /// Serialize the histogram into the provided writer. - /// Returns the number of bytes written, or an error. - /// - /// Note that `Vec` is a reasonable `Write` implementation for simple usage. 
- pub fn serialize( + fn serialize( &mut self, h: &Histogram, writer: &mut W, diff --git a/src/serialization/v2_serializer.rs b/src/serialization/v2_serializer.rs index fb61007..91bc38e 100644 --- a/src/serialization/v2_serializer.rs +++ b/src/serialization/v2_serializer.rs @@ -1,4 +1,4 @@ -use super::{V2_COOKIE, V2_HEADER_SIZE}; +use super::{Serializer, V2_COOKIE, V2_HEADER_SIZE}; use super::super::{Counter, Histogram}; use std::io::{ErrorKind, Write}; use std; @@ -38,12 +38,12 @@ impl V2Serializer { pub fn new() -> V2Serializer { V2Serializer { buf: Vec::new() } } +} + +impl Serializer for V2Serializer { + type SerializeError = V2SerializeError; - /// Serialize the histogram into the provided writer. - /// Returns the number of bytes written, or an error. - /// - /// Note that `Vec` is a reasonable `Write` implementation for simple usage. - pub fn serialize( + fn serialize( &mut self, h: &Histogram, writer: &mut W, diff --git a/tests/histogram.rs b/tests/histogram.rs index c8fcc96..bd75190 100644 --- a/tests/histogram.rs +++ b/tests/histogram.rs @@ -6,6 +6,7 @@ extern crate rand; use self::rand::Rng; use hdrsample::{Counter, Histogram, SubtractionError}; +use hdrsample::serialization::Serializer; use std::borrow::Borrow; use std::fmt; diff --git a/tests/serialization.rs b/tests/serialization.rs index 87c8d68..7f49661 100644 --- a/tests/serialization.rs +++ b/tests/serialization.rs @@ -3,7 +3,7 @@ mod tests { extern crate hdrsample; use self::hdrsample::Histogram; - use self::hdrsample::serialization::{Deserializer, V2Serializer}; + use self::hdrsample::serialization::{Deserializer, Serializer, V2Serializer}; use std::io::{BufRead, BufReader, Read}; use std::fs::File; From e32c13a470636e93167109026f563b43924128e9 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Thu, 26 Oct 2017 11:51:04 -0500 Subject: [PATCH 03/18] Move a test into serialization module so it's easier to maintain --- tests/histogram.rs | 22 ---------------------- tests/serialization.rs | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/tests/histogram.rs b/tests/histogram.rs index bd75190..42096ee 100644 --- a/tests/histogram.rs +++ b/tests/histogram.rs @@ -6,7 +6,6 @@ extern crate rand; use self::rand::Rng; use hdrsample::{Counter, Histogram, SubtractionError}; -use hdrsample::serialization::Serializer; use std::borrow::Borrow; use std::fmt; @@ -536,27 +535,6 @@ fn total_count_overflow_from_add_with_resize_saturates() { assert_eq!(u64::max_value(), h.len()); } -#[test] -#[cfg(feature = "serialization")] -fn total_count_overflow_from_deserialize_saturates() { - use hdrsample::serialization::{Deserializer, V2Serializer}; - let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); - - // can't go bigger than i64 max because it will be serialized - h.record_n(1, i64::max_value() as u64).unwrap(); - h.record_n(1000, i64::max_value() as u64).unwrap(); - h.record_n(1_000_000, i64::max_value() as u64).unwrap(); - assert_eq!(u64::max_value(), h.len()); - - let mut vec = Vec::new(); - - V2Serializer::new().serialize(&h, &mut vec).unwrap(); - let deser_h: Histogram = Deserializer::new() - .deserialize(&mut vec.as_slice()) - .unwrap(); - assert_eq!(u64::max_value(), deser_h.len()); -} - #[test] fn subtract_underflow_guarded_by_per_value_count_check() { let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); diff --git a/tests/serialization.rs b/tests/serialization.rs index 7f49661..3f6b8f4 100644 --- a/tests/serialization.rs +++ b/tests/serialization.rs @@ 
-50,6 +50,25 @@ mod tests { assert_eq!(h, deser_h); } + #[test] + fn total_count_overflow_from_deserialize_saturates() { + let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + + // can't go bigger than i64 max because it will be serialized + h.record_n(1, i64::max_value() as u64).unwrap(); + h.record_n(1000, i64::max_value() as u64).unwrap(); + h.record_n(1000_000, i64::max_value() as u64).unwrap(); + assert_eq!(u64::max_value(), h.len()); + + let mut vec = Vec::new(); + + V2Serializer::new().serialize(&h, &mut vec).unwrap(); + let deser_h: Histogram = Deserializer::new() + .deserialize(&mut vec.as_slice()) + .unwrap(); + assert_eq!(u64::max_value(), deser_h.len()); + } + fn load_histogram_from_num_per_line(path: &Path) -> Histogram { // max is Java's Long.MAX_VALUE let mut h: Histogram = From b5bd78ff10a1d94ef7a0aa95659f6eed1d6c5880 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Thu, 26 Oct 2017 21:37:29 -0500 Subject: [PATCH 04/18] High level tests of interval log reading and writing. Added a sample file from the Java impl. Added max value divisor. --- src/serialization/interval_log/mod.rs | 43 +++- src/serialization/interval_log/tests.rs | 20 +- tests/data/tagged-Log.logV2.hlog | 46 ++++ tests/interval_log.rs | 306 ++++++++++++++++++++++++ 4 files changed, 395 insertions(+), 20 deletions(-) create mode 100644 tests/data/tagged-Log.logV2.hlog create mode 100644 tests/interval_log.rs diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index e90b512..f19d6d4 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -13,7 +13,7 @@ extern crate base64; -use std::{io, str}; +use std::{io, ops, str}; use std::fmt::Write; use nom::{double, line_ending, not_line_ending, IResult}; @@ -75,15 +75,19 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, /// BaseTime offset, you should instead use a delta since that time. /// `duration` is the duration of the interval in seconds. /// `tag` is an optional tag for this histogram. + /// `max_value_divisor` is used to scale down the max value to something that may be more human + /// readable. The max value in the log is only for human consumption, so you might prefer to + /// divide by 10^9 to turn nanoseconds into fractional seconds, for instance. pub fn write_histogram( &mut self, h: &Histogram, start_timestamp: f64, duration: f64, tag: Option, + max_value_divisor: f64, ) -> Result<(), IntervalLogWriterError> { self.internal_writer - .write_histogram(h, start_timestamp, duration, tag) + .write_histogram(h, start_timestamp, duration, tag, max_value_divisor) } } @@ -121,6 +125,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, start_timestamp: f64, duration: f64, tag: Option, + max_value_divisor: f64, ) -> Result<(), IntervalLogWriterError> { self.serialize_buf.clear(); self.text_buf.clear(); @@ -131,11 +136,11 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, write!( self.writer, - "{}{},{},{},", + "{}{:.3},{:.3},{:.3},", self.text_buf, start_timestamp, duration, - h.max() + h.max() as f64 / max_value_divisor // because the Java impl does it this way )?; self.text_buf.clear(); @@ -153,9 +158,12 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, /// A tag for an interval histogram. /// -/// Tags are just strings that do not contain a few disallowed characters: ',', '\r', '\n', and ' '. 
+/// Tags are just `str`s that do not contain a few disallowed characters: ',', '\r', '\n', and ' '. +/// +/// To get the wrapped `str` back out, use `as_str()` or the `Deref` implementation +/// (`&some_tag`). #[derive(Debug, PartialEq, Clone, Copy)] -pub struct Tag<'a>(&'a str); +pub struct Tag<'a>(pub &'a str); impl<'a> Tag<'a> { /// Create a new Tag. @@ -170,6 +178,19 @@ impl<'a> Tag<'a> { Some(Tag(s)) } } + + /// Returns the tag contents as a str. + pub fn as_str(&self) -> &'a str { + self.0 + } +} + +impl<'a> ops::Deref for Tag<'a> { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } } /// An individual interval histogram parsed from an interval log. @@ -178,7 +199,7 @@ pub struct IntervalLogHistogram<'a> { tag: Option>, start_timestamp: f64, duration: f64, - max_value: f64, + max: f64, encoded_histogram: &'a str, } @@ -205,8 +226,8 @@ impl<'a> IntervalLogHistogram<'a> { /// /// This max value is the max of the histogram divided by some scaling factor (which may be /// 1.0). - pub fn max_value(&self) -> f64 { - self.max_value + pub fn max(&self) -> f64 { + self.max } /// Base64-encoded serialized histogram. @@ -384,7 +405,7 @@ named!(interval_hist<&[u8], LogEntry>, char!(',') >> duration: double >> char!(',') >> - max_value: double >> + max: double >> char!(',') >> encoded_histogram: map_res!(not_line_ending, str::from_utf8) >> line_ending >> @@ -392,7 +413,7 @@ named!(interval_hist<&[u8], LogEntry>, tag, start_timestamp, duration, - max_value, + max, encoded_histogram })) ) diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index 7282d62..6d2be6c 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -43,11 +43,13 @@ fn write_interval_histo_no_tag() { let header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); let mut log_writer = header_writer.into_log_writer(); - log_writer.write_histogram(&h, 1.234, 5.678, None).unwrap(); + log_writer + .write_histogram(&h, 1.2345678, 5.67, None, 1.0) + .unwrap(); } assert_eq!( - "1.234,5.678,0,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", + "1.235,5.670,0.000,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", str::from_utf8(&buf[..]).unwrap() ); } @@ -64,12 +66,12 @@ fn write_interval_histo_with_tag() { let mut log_writer = header_writer.into_log_writer(); log_writer - .write_histogram(&h, 1.234, 5.678, Tag::new("t")) + .write_histogram(&h, 1.234, 5.678, Tag::new("t"), 1.0) .unwrap(); } assert_eq!( - "Tag=t,1.234,5.678,0,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", + "Tag=t,1.234,5.678,0.000,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", str::from_utf8(&buf[..]).unwrap() ); } @@ -122,7 +124,7 @@ fn parse_interval_hist_no_tag() { tag: None, start_timestamp: 0.127, duration: 1.007, - max_value: 2.769, + max: 2.769, encoded_histogram: "couldBeBase64", }); @@ -138,7 +140,7 @@ fn parse_interval_hist_with_tag() { tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, - max_value: 2.769, + max: 2.769, encoded_histogram: "couldBeBase64", }); @@ -162,7 +164,7 @@ fn iter_with_ignored_prefix() { tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, - max_value: 2.769, + max: 2.769, encoded_histogram: "couldBeBase64", }); @@ -185,7 +187,7 @@ fn iter_without_ignored_prefix() { tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, - max_value: 2.769, + max: 2.769, encoded_histogram: "couldBeBase64", }); @@ -214,7 +216,7 @@ fn 
iter_multiple_entrties_with_interleaved_ignored() { tag: Some(Tag("t")), start_timestamp: 0.127, duration: 1.007, - max_value: 2.769, + max: 2.769, encoded_histogram: "couldBeBase64", }); diff --git a/tests/data/tagged-Log.logV2.hlog b/tests/data/tagged-Log.logV2.hlog new file mode 100644 index 0000000..520f41d --- /dev/null +++ b/tests/data/tagged-Log.logV2.hlog @@ -0,0 +1,46 @@ +#[Logged with jHiccup version 2.0.7-SNAPSHOT, manually edited to duplicate contents with Tag=A] +#[Histogram log format version 1.2] +#[StartTime: 1441812279.474 (seconds since epoch), Wed Sep 09 08:24:39 PDT 2015] +"StartTimestamp","Interval_Length","Interval_Max","Interval_Compressed_Histogram" +0.127,1.007,2.769,HISTFAAAAEV42pNpmSzMwMCgyAABTBDKT4GBgdnNYMcCBvsPEBEJISEuATEZMQ4uASkhIR4nrxg9v2lMaxhvMekILGZkKmcCAEf2CsI= +Tag=A,0.127,1.007,2.769,HISTFAAAAEV42pNpmSzMwMCgyAABTBDKT4GBgdnNYMcCBvsPEBEJISEuATEZMQ4uASkhIR4nrxg9v2lMaxhvMekILGZkKmcCAEf2CsI= +1.134,0.999,0.442,HISTFAAAAEJ42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBEWLj45FTExAT4pBSEBKa6UkAgBi1uM7xjfMMlwMDABAC0CCjM= +Tag=A,1.134,0.999,0.442,HISTFAAAAEJ42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBEWLj45FTExAT4pBSEBKa6UkAgBi1uM7xjfMMlwMDABAC0CCjM= +2.133,1.001,0.426,HISTFAAAAD942pNpmSzMwMAgwwABTBDKT4GBgdnNYMcCBvsPEBE+Ph4OLgk5OSkeIS4+LgEeswIDo1+MbmdYNASYAA51CSo= +Tag=A,2.133,1.001,0.426,HISTFAAAAD942pNpmSzMwMAgwwABTBDKT4GBgdnNYMcCBvsPEBE+Ph4OLgk5OSkeIS4+LgEeswIDo1+MbmdYNASYAA51CSo= +3.134,1.001,0.426,HISTFAAAAD942pNpmSzMwMAgwwABTBDKT4GBgdnNYMcCBvsPEBExPiEpITEFGTkRKSEeOR6FkCg1hTeMXvNYlHhYABQ5CTo= +Tag=A,3.134,1.001,0.426,HISTFAAAAD942pNpmSzMwMAgwwABTBDKT4GBgdnNYMcCBvsPEBExPiEpITEFGTkRKSEeOR6FkCg1hTeMXvNYlHhYABQ5CTo= +4.135,0.997,0.426,HISTFAAAAD942pNpmSzMwMAgwwABTBDKT4GBgdnNYMcCBvsPEBE2PiERBREpBREhER4+Hj4uvQAdrTlMBldYDDhYAAugCKk= +Tag=A,4.135,0.997,0.426,HISTFAAAAD942pNpmSzMwMAgwwABTBDKT4GBgdnNYMcCBvsPEBE2PiERBREpBREhER4+Hj4uvQAdrTlMBldYDDhYAAugCKk= +5.132,1.002,0.426,HISTFAAAAEF42pNpmSzMwMAgywABTBDKT4GBgdnNYMcCBvsPEBEWPhElOR4pARUpKTkpGQkxq2mMegZnGI0+MZuIcAEAHo8Jvw== +Tag=A,5.132,1.002,0.426,HISTFAAAAEF42pNpmSzMwMAgywABTBDKT4GBgdnNYMcCBvsPEBEWPhElOR4pARUpKTkpGQkxq2mMegZnGI0+MZuIcAEAHo8Jvw== +6.134,0.999,0.442,HISTFAAAAEF42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBEWIS4FITEhDiEJERE+GT6ZkhZGLbl7jEqrWHREmFgAIbAJMw== +Tag=A,6.134,0.999,0.442,HISTFAAAAEF42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBEWIS4FITEhDiEJERE+GT6ZkhZGLbl7jEqrWHREmFgAIbAJMw== +7.133,0.999,0.459,HISTFAAAAEJ42pNpmSzMwMCgwAABTBDKD8hndjPYsYDB/gNEhEtMQEBBTk5MQERCRkBEQEWlh9FJbg9jE+MS5ig1LhYmADkkCcE= +Tag=A,7.133,0.999,0.459,HISTFAAAAEJ42pNpmSzMwMCgwAABTBDKD8hndjPYsYDB/gNEhEtMQEBBTk5MQERCRkBEQEWlh9FJbg9jE+MS5ig1LhYmADkkCcE= +8.132,1.000,0.459,HISTFAAAAEB42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBEWIREgEOIQEuGT4xHg41Oo0pIqu8LYwVImwMfGBAAfkgkw +Tag=A,8.132,1.000,0.459,HISTFAAAAEB42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBEWIREgEOIQEuGT4xHg41Oo0pIqu8LYwVImwMfGBAAfkgkw +9.132,1.751,1551.892,HISTFAAAAJZ42pNpmSzMwMB0nQECmCCUnwIDA7ObwY4FDPYfYDJMXFxsbGwMbBwszDwsDDxsHFw6RWJMLJMZmcqBMJrJmskSiA2ZZJmkgRBCgmheIORGI1H5rEzMQAyDzFhY2EWRWUwMWCBxQtQQhAIWJiyAaEHyFbKwsLHAADYWAWmiFeKS5gACLsIEzdQICAgBIQShEfhFABXDF+M= +Tag=A,9.132,1.751,1551.892,HISTFAAAAJZ42pNpmSzMwMB0nQECmCCUnwIDA7ObwY4FDPYfYDJMXFxsbGwMbBwszDwsDDxsHFw6RWJMLJMZmcqBMJrJmskSiA2ZZJmkgRBCgmheIORGI1H5rEzMQAyDzFhY2EWRWUwMWCBxQtQQhAIWJiyAaEHyFbKwsLHAADYWAWmiFeKS5gACLsIEzdQICAgBIQShEfhFABXDF+M= +10.883,0.250,0.426,HISTFAAAAD142pNpmSzMwMAgxQABTBDKT4GBgdnNYMcCBvsPEBEeFi4mPg4WLhY2BjY2FhYOBSkpASEtoRA+NgDkCQZR 
+Tag=A,10.883,0.250,0.426,HISTFAAAAD142pNpmSzMwMAgxQABTBDKT4GBgdnNYMcCBvsPEBEeFi4mPg4WLhY2BjY2FhYOBSkpASEtoRA+NgDkCQZR +11.133,1.003,0.524,HISTFAAAAER42pNpmSzMwMCgyAABTBDKT4GBgdnNYMcCBvsPUBk2HgkZKREpEQUeGSEBAQ6xSYxhCnp7GJ02sWgJsbCwMgEAO0AJSQ== +Tag=A,11.133,1.003,0.524,HISTFAAAAER42pNpmSzMwMCgyAABTBDKT4GBgdnNYMcCBvsPUBk2HgkZKREpEQUeGSEBAQ6xSYxhCnp7GJ02sWgJsbCwMgEAO0AJSQ== +12.136,0.997,0.459,HISTFAAAAEB42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPUBk2AT4eCQURHgkuEREOHjERlSQhhWuMSV9Y7ERYWAAa4gko +Tag=A,12.136,0.997,0.459,HISTFAAAAEB42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPUBk2AT4eCQURHgkuEREOHjERlSQhhWuMSV9Y7ERYWAAa4gko +13.133,0.998,0.459,HISTFAAAAD942pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPMBkRIR4RMRk5KQE+PgEhMRmzEjWZJ4whW1hMBNiYAB42CTA= +Tag=A,13.133,0.998,0.459,HISTFAAAAD942pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPMBkRIR4RMRk5KQE+PgEhMRmzEjWZJ4whW1hMBNiYAB42CTA= +14.131,1.000,0.492,HISTFAAAAEN42pNpmSzMwMCgyAABTBDKT4GBgdnNYMcCBvsPUBkWFhE5GT4FKQkRCR4ZCREpqwmMBhpHGG16WHx42JgYmAA6swk+ +Tag=A,14.131,1.000,0.492,HISTFAAAAEN42pNpmSzMwMCgyAABTBDKT4GBgdnNYMcCBvsPUBkWFhE5GT4FKQkRCR4ZCREpqwmMBhpHGG16WHx42JgYmAA6swk+ +15.131,1.001,0.442,HISTFAAAAD542pNpmSzMwMAgywABTBDKT4GBgdnNYMcCBvsPMBkuMTEFHgklFRkRATkJERGdKgudfYwRTSwGalwAF2IJOw== +Tag=A,15.131,1.001,0.442,HISTFAAAAD542pNpmSzMwMAgywABTBDKT4GBgdnNYMcCBvsPMBkuMTEFHgklFRkRATkJERGdKgudfYwRTSwGalwAF2IJOw== +16.132,1.001,0.524,HISTFAAAAEZ42pNpmSzMwMCgxAABTBDKT4GBgdnNYMcCBvsPEBE2IQEFCQkpGREpHj4hKS6NU4z7GDMkuBoYDSYw2wiwMLEyAQBQ3wne +Tag=A,16.132,1.001,0.524,HISTFAAAAEZ42pNpmSzMwMCgxAABTBDKT4GBgdnNYMcCBvsPEBE2IQEFCQkpGREpHj4hKS6NU4z7GDMkuBoYDSYw2wiwMLEyAQBQ3wne +17.133,0.998,0.459,HISTFAAAAEB42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPUBk2DjElIR4RHiExKQE5IT61iCodtXWMdn0sKVJMTAAekAk0 +Tag=A,17.133,0.998,0.459,HISTFAAAAEB42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPUBk2DjElIR4RHiExKQE5IT61iCodtXWMdn0sKVJMTAAekAk0 +18.131,1.000,0.459,HISTFAAAAEF42pNpmSzMwMAgzwABTBDKT4GBgdnNYMcCBvsPUBkWISERJSUJESklHhEJEREhqwZGLakPjDZdLBYCHCwAKOkJPg== +Tag=A,18.131,1.000,0.459,HISTFAAAAEF42pNpmSzMwMAgzwABTBDKT4GBgdnNYMcCBvsPUBkWISERJSUJESklHhEJEREhqwZGLakPjDZdLBYCHCwAKOkJPg== +19.131,1.000,0.475,HISTFAAAAEF42pNpmSzMwMAgzwABTBDKT4GBgdnNYMcCBvsPUAk2HjkJBSk+Pi4BMT4xIQE9pxIluTOMPhtYbITY2JgAKLoJOQ== +Tag=A,19.131,1.000,0.475,HISTFAAAAEF42pNpmSzMwMAgzwABTBDKT4GBgdnNYMcCBvsPUAk2HjkJBSk+Pi4BMT4xIQE9pxIluTOMPhtYbITY2JgAKLoJOQ== +20.131,1.004,0.475,HISTFAAAAEF42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBFmPhEJOSEhDi4+ETEeASEhswIVi1+MFjtYvCRYGJgAIP8JNw== +Tag=A,20.131,1.004,0.475,HISTFAAAAEF42pNpmSzMwMAgxwABTBDKT4GBgdnNYMcCBvsPEBFmPhEJOSEhDi4+ETEeASEhswIVi1+MFjtYvCRYGJgAIP8JNw== diff --git a/tests/interval_log.rs b/tests/interval_log.rs new file mode 100644 index 0000000..3b79ab6 --- /dev/null +++ b/tests/interval_log.rs @@ -0,0 +1,306 @@ +#[cfg(all(feature = "serialization", test))] +mod tests { + extern crate base64; + extern crate hdrsample; + extern crate rand; + + use self::rand::Rng; + + use self::hdrsample::Histogram; + use self::hdrsample::serialization::{Deserializer, Serializer, V2Serializer}; + use self::hdrsample::serialization::interval_log::{IntervalLogHeaderWriter, + IntervalLogHistogram, IntervalLogIterator, + LogEntry, LogIteratorError, Tag}; + + use std::{io, str}; + use std::io::{BufRead, Read}; + use std::fs::File; + use std::path::Path; + + #[test] + fn parse_sample_tagged_interval_log_start_timestamp() { + let data = load_iterator_from_file(Path::new("tests/data/tagged-Log.logV2.hlog")); + let start_count = data.into_iter() + .map(|r| r.unwrap()) + .filter_map(|e| match e { + LogEntry::StartTime(t) => Some(t), 
+ _ => None, + }) + .count(); + assert_eq!(1, start_count); + } + + + #[test] + fn parse_sample_tagged_interval_log_interval_count() { + let data = load_iterator_from_file(Path::new("tests/data/tagged-Log.logV2.hlog")); + let intervals = data.into_iter() + .map(|r| r.unwrap()) + .filter_map(|e| match e { + LogEntry::Interval(ilh) => Some(ilh), + _ => None, + }) + .collect::>(); + + assert_eq!(42, intervals.len()); + + // half have tags, half do not + assert_eq!( + 21, + intervals.iter().filter(|ilh| ilh.tag().is_none()).count() + ); + assert_eq!( + 21, + intervals.iter().filter(|ilh| !ilh.tag().is_none()).count() + ); + } + + #[test] + fn parse_sample_tagged_interval_log_interval_metadata() { + let data = load_iterator_from_file(Path::new("tests/data/tagged-Log.logV2.hlog")); + let intervals = data.into_iter() + .map(|r| r.unwrap()) + .filter_map(|e| match e { + LogEntry::Interval(ilh) => Some(ilh), + _ => None, + }) + .collect::>(); + + let expected = vec![ + (0.127, 1.007, 2.769), + (1.134, 0.999, 0.442), + (2.133, 1.001, 0.426), + (3.134, 1.001, 0.426), + (4.135, 0.997, 0.426), + (5.132, 1.002, 0.426), + (6.134, 0.999, 0.442), + (7.133, 0.999, 0.459), + (8.132, 1.0, 0.459), + (9.132, 1.751, 1551.892), + (10.883, 0.25, 0.426), + (11.133, 1.003, 0.524), + (12.136, 0.997, 0.459), + (13.133, 0.998, 0.459), + (14.131, 1.0, 0.492), + (15.131, 1.001, 0.442), + (16.132, 1.001, 0.524), + (17.133, 0.998, 0.459), + (18.131, 1.0, 0.459), + (19.131, 1.0, 0.475), + (20.131, 1.004, 0.475), + ]; + + // tagged and un-tagged are identical + + assert_eq!( + expected, + intervals + .iter() + .filter(|ilh| ilh.tag().is_none()) + .map(|ilh| { (ilh.start_timestamp(), ilh.duration(), ilh.max()) }) + .collect::>() + ); + + assert_eq!( + expected, + intervals + .iter() + .filter(|ilh| !ilh.tag().is_none()) + .map(|ilh| { (ilh.start_timestamp(), ilh.duration(), ilh.max()) }) + .collect::>() + ); + + let mut deserializer = Deserializer::new(); + for ilh in intervals { + let serialized_histogram = + base64::decode_config(ilh.encoded_histogram(), base64::STANDARD).unwrap(); + let decoded_hist: Histogram = deserializer + .deserialize(&mut io::Cursor::new(&serialized_histogram)) + .unwrap(); + + // this log happened to use 1000000 as the scaling factor. It was also formatted to 3 + // decimal places. + assert_eq!(round(decoded_hist.max() as f64 / 1_000_000_f64), ilh.max()); + } + } + + #[test] + fn parse_sample_tagged_interval_log_rewrite_identical() { + // trim off the comments and legend line + let reader = + io::BufReader::new(File::open(Path::new("tests/data/tagged-Log.logV2.hlog")).unwrap()); + + // the sample log uses DEFLATE compressed histograms, which we can't match exactly, so the + // best we can do is to re-serialize each one as uncompressed. 
+ + let mut serializer = V2Serializer::new(); + let mut deserializer = Deserializer::new(); + + let mut serialize_buf = Vec::new(); + let mut log_without_headers = Vec::new(); + reader + .lines() + .skip(4) + .map(|r| r.unwrap()) + .for_each(|mut line| { + let hist_index = line.rfind("HISTF").unwrap(); + let serialized = + base64::decode_config(&line[hist_index..], base64::STANDARD).unwrap(); + + let decoded_hist: Histogram = deserializer + .deserialize(&mut io::Cursor::new(serialized)) + .unwrap(); + + serialize_buf.clear(); + serializer + .serialize(&decoded_hist, &mut serialize_buf) + .unwrap(); + + // replace the deflate histogram with the predictable non-deflate one + line.truncate(hist_index); + line.push_str(&base64::encode_config(&serialize_buf, base64::STANDARD)); + + log_without_headers.extend_from_slice(line.as_bytes()); + log_without_headers.extend_from_slice("\n".as_bytes()); + }); + + let mut duplicate_log = Vec::new(); + + { + let mut writer = + IntervalLogHeaderWriter::new(&mut duplicate_log, &mut serializer).into_log_writer(); + + IntervalLogIterator::new(&log_without_headers) + .map(|r| r.unwrap()) + .filter_map(|e| match e { + LogEntry::Interval(ilh) => Some(ilh), + _ => None, + }) + .for_each(|ilh| { + let serialized_histogram = + base64::decode_config(ilh.encoded_histogram(), base64::STANDARD).unwrap(); + let decoded_hist: Histogram = deserializer + .deserialize(&mut io::Cursor::new(&serialized_histogram)) + .unwrap(); + + writer + .write_histogram( + &decoded_hist, + ilh.start_timestamp(), + ilh.duration(), + ilh.tag(), + 1_000_000.0, + ) + .unwrap(); + }); + } + + + let orig_str = str::from_utf8(&log_without_headers).unwrap(); + let rewritten_str = str::from_utf8(&duplicate_log).unwrap(); + + assert_eq!(orig_str, rewritten_str); + } + + #[test] + fn write_random_histograms_to_interval_log_then_read() { + let mut rng = rand::thread_rng(); + + let mut histograms = Vec::new(); + let mut tags = Vec::new(); + + let mut log_buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + let max_scaling_factor = 1_000_000.0; + + { + let mut writer = + IntervalLogHeaderWriter::new(&mut log_buf, &mut serializer).into_log_writer(); + + writer.write_comment("start").unwrap(); + + for i in 0_u32..100 { + let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + + for _ in 0..100 { + // ensure no count above i64::max_value(), even when many large values are + // bucketed together + h.record_n(rng.gen::() >> 32, rng.gen::() >> 32) + .unwrap(); + } + + if rng.gen() { + tags.push(Some(format!("t{}", i))); + } else { + tags.push(None); + }; + let current_tag_str = tags.last().unwrap(); + let tag = current_tag_str + .as_ref() + .map(|s| Tag::new(s.as_str()).unwrap()); + + writer + .write_histogram(&h, i as f64, (i as f64) + 10000.0, tag, max_scaling_factor) + .unwrap(); + + writer.write_comment(&format!("line {}", i)).unwrap(); + + histograms.push(h); + } + } + + let parsed = IntervalLogIterator::new(&log_buf) + .map(|r| r.unwrap()) + .filter_map(|e| match e { + LogEntry::Interval(ilh) => Some(ilh), + _ => None, + }) + .collect::>(); + + assert_eq!(histograms.len(), parsed.len()); + + let mut deserializer = Deserializer::new(); + for (index, ilh) in parsed.iter().enumerate() { + let serialized_histogram = + base64::decode_config(ilh.encoded_histogram(), base64::STANDARD).unwrap(); + let decoded_hist: Histogram = deserializer + .deserialize(&mut io::Cursor::new(&serialized_histogram)) + .unwrap(); + let original_hist = &histograms[index]; + + 
assert_eq!(original_hist, &decoded_hist); + + assert_eq!(index as f64, ilh.start_timestamp()); + assert_eq!((index as f64) + 10000.0, ilh.duration()); + assert_eq!(round(original_hist.max() as f64 / max_scaling_factor), ilh.max()); + let tag_string: Option = tags.get(index).unwrap().as_ref().map(|s| s.clone()); + assert_eq!(tag_string, ilh.tag().map(|t| t.as_str().to_owned())); + } + } + + /// Round to 3 digits the way floats are in the log + fn round(f: f64) -> f64 { + format!("{:.3}", f).parse::().unwrap() + } + + fn load_iterator_from_file<'a>(path: &Path) -> IntervalLogBufHolder { + let mut buf = Vec::new(); + let _ = File::open(path).unwrap().read_to_end(&mut buf).unwrap(); + + IntervalLogBufHolder { data: buf } + } + + struct IntervalLogBufHolder { + data: Vec, + } + + impl<'a> IntoIterator for &'a IntervalLogBufHolder { + type Item = Result, LogIteratorError>; + type IntoIter = IntervalLogIterator<'a>; + + fn into_iter(self) -> Self::IntoIter { + IntervalLogIterator::new(self.data.as_slice()) + } + } +} From 471be3af6d1e5c6188628df4bc55971c7bcac46c Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Thu, 26 Oct 2017 22:16:24 -0500 Subject: [PATCH 05/18] Clarify some docs --- src/serialization/interval_log/mod.rs | 13 +++++++------ src/serialization/mod.rs | 2 +- tests/interval_log.rs | 5 ++++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index f19d6d4..1ee4788 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -9,7 +9,7 @@ //! An interval log contains some initial metadata, then a sequence of histograms, each with some //! additional metadata (timestamps, etc). //! -//! To parse a log, see `IntervalLogIterator`. To write a log, see `LogHeaderWriter`. +//! To parse a log, see `IntervalLogIterator`. To write a log, see `IntervalLogHeaderWriter`. extern crate base64; @@ -21,7 +21,7 @@ use nom::{double, line_ending, not_line_ending, IResult}; use super::super::{Counter, Histogram}; use super::Serializer; -/// The starting point for writing an interval log. +/// Start writing an interval log. /// /// This type only allows writing comments and headers. Once you're done writing those things, use /// `into_log_writer()` to convert this into an `IntervalLogWriter`. @@ -58,7 +58,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogHeaderWriter<'a, /// Writes interval histograms in an interval log. /// -/// +/// This isn't created directly; start with an `IntervalLogHeaderWriter`. pub struct IntervalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { internal_writer: InternalLogWriter<'a, 'b, W, S>, } @@ -211,8 +211,8 @@ impl<'a> IntervalLogHistogram<'a> { /// Timestamp of the start of the interval in seconds. /// - /// The timestamp may be absolute vs the epoch, or there may be a StartTime or BaseTime for the - /// log. + /// The timestamp may be absolute vs the epoch, or there may be a `StartTime` or `BaseTime` for + /// the log, in which case you may wish to consider this number as a delta vs those timestamps.. pub fn start_timestamp(&self) -> f64 { self.start_timestamp } @@ -232,7 +232,8 @@ impl<'a> IntervalLogHistogram<'a> { /// Base64-encoded serialized histogram. /// - /// If you need the deserialized histogram, use a `Deserializer. + /// If you need the deserialized histogram, base64-decode and use a `Deserializer` on the + /// resulting bytes. 
/// /// Histograms are left in their original encoding to make parsing each log entry very cheap. /// One usage pattern is to navigate to a certain point in the log and only deserialize a few diff --git a/src/serialization/mod.rs b/src/serialization/mod.rs index 30c405e..2bd16fd 100644 --- a/src/serialization/mod.rs +++ b/src/serialization/mod.rs @@ -51,7 +51,7 @@ //! //! # Interval logs //! -//! Parsing interval logs is supported via `IntervalLogIterator`. +//! See the `interval_log` module. //! //! ### Integration with general-purpose serialization libraries //! diff --git a/tests/interval_log.rs b/tests/interval_log.rs index 3b79ab6..44df4e3 100644 --- a/tests/interval_log.rs +++ b/tests/interval_log.rs @@ -273,7 +273,10 @@ mod tests { assert_eq!(index as f64, ilh.start_timestamp()); assert_eq!((index as f64) + 10000.0, ilh.duration()); - assert_eq!(round(original_hist.max() as f64 / max_scaling_factor), ilh.max()); + assert_eq!( + round(original_hist.max() as f64 / max_scaling_factor), + ilh.max() + ); let tag_string: Option = tags.get(index).unwrap().as_ref().map(|s| s.clone()); assert_eq!(tag_string, ilh.tag().map(|t| t.as_str().to_owned())); } From a6eb1ab3e421cd2a50d5118a6f416973cc17d1fd Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 27 Oct 2017 10:15:58 -0500 Subject: [PATCH 06/18] Make uniform-varint-length random number generator available to benches. I want to use that in interval log benchmarks, hence tackling that TODO. For serialization benchmarks, this is just right: we want to exercise all different varint lengths (modulo bucketing, which throws it off a little bit). For non-serialization benches (e.g. add and subtract) this is still a lot better than just regular random numbers, which will be heavily weighted towards the last few buckets. This drops measurements quite a bit, but that's expected because now histograms aren't mostly using values way in the high end that all get bucketed together, so now there's a lot more work to do when traversing the histogram. 
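As a rough sense of the bias (a sketch assuming the 7-payload-bits-per-byte varint layout used elsewhere in this crate, with a special-cased 9th byte): values below 2^56 fit in at most 8 varint bytes, so a uniformly distributed u64 needs the full 9-byte encoding with probability 1 - 2^56/2^64 = 255/256, i.e. roughly 99.6% of the time, which is why uniform values land almost entirely in the top buckets.

    // Sketch only: fraction of uniformly random u64 values that need the
    // longest (9-byte) varint, given 7 payload bits per byte for the first 8 bytes.
    fn fraction_needing_9_byte_varint() -> f64 {
        1.0 - (1u64 << 56) as f64 / ((u64::max_value() as f64) + 1.0)
    }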
--- .gitignore | 1 + benches/record.rs | 41 +++++----- benches/serialization.rs | 25 +++--- src/serialization/rand_varint.rs | 126 +++++++++++++++++++++++++++++ src/serialization/tests.rs | 134 ++----------------------------- 5 files changed, 169 insertions(+), 158 deletions(-) create mode 100644 src/serialization/rand_varint.rs diff --git a/.gitignore b/.gitignore index ae39cda..2ad489c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ Cargo.lock .criterion perf.data* +/tmp diff --git a/benches/record.rs b/benches/record.rs index 8289f21..758988b 100644 --- a/benches/record.rs +++ b/benches/record.rs @@ -5,22 +5,24 @@ extern crate rand; extern crate test; use hdrsample::*; -use self::rand::Rng; use self::test::Bencher; +use self::rand_varint::*; + +#[path = "../src/serialization/rand_varint.rs"] +mod rand_varint; + #[bench] fn record_precalc_random_values_with_1_count_u64(b: &mut Bencher) { let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); let mut indices = Vec::::new(); - // TODO improve this and similar benchmarks to use a non-uniform distribution (like that used - // in serialization tests) so we're not always recording in the top few buckets let mut rng = rand::weak_rng(); // same value approach as record_precalc_random_values_with_max_count_u64 so that they are // comparable - for _ in 0..1000_000 { - indices.push(rng.gen()); + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(1_000_000) { + indices.push(v); } b.iter(|| { @@ -39,10 +41,9 @@ fn record_precalc_random_values_with_max_count_u64(b: &mut Bencher) { // store values in an array and re-use so we can be sure to hit the overflow case - for _ in 0..1000_000 { - let r = rng.gen(); - indices.push(r); - h.record_n(r, u64::max_value()).unwrap(); + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(1_000_000) { + indices.push(v); + h.record_n(v, u64::max_value()).unwrap(); } b.iter(|| { @@ -59,8 +60,8 @@ fn record_correct_precalc_random_values_with_1_count_u64(b: &mut Bencher) { let mut indices = Vec::::new(); let mut rng = rand::weak_rng(); - for _ in 0..10_000 { - indices.push(rng.gen()); + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(10_000) { + indices.push(v); } b.iter(|| { @@ -79,8 +80,10 @@ fn record_random_values_with_1_count_u64(b: &mut Bencher) { // This should be *slower* than the benchmarks above where we pre-calculate the values // outside of the hot loop. If it isn't, then those measurements are likely spurious. 
- b.iter(|| for _ in 0..1000_000 { - h.record(rng.gen()).unwrap() + b.iter(|| { + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(1_000_000) { + h.record(v).unwrap() + } }) } @@ -136,11 +139,10 @@ fn do_subtract_benchmark Histogram>( for _ in 0..1000 { let mut h = addend_factory(); - for _ in 0..1000 { - let r = rng.gen(); - h.record_n(r, count_at_each_addend_value).unwrap(); + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(1_000) { + h.record_n(v, count_at_each_addend_value).unwrap(); // ensure there's a count to subtract from - accum.record_n(r, count_at_each_addend_value).unwrap(); + accum.record_n(v, count_at_each_addend_value).unwrap(); } subtrahends.push(h); @@ -166,9 +168,8 @@ fn do_add_benchmark Histogram>( for _ in 0..1000 { let mut h = addend_factory(); - for _ in 0..1000 { - let r = rng.gen(); - h.record_n(r, count_at_each_addend_value).unwrap(); + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(1_000) { + h.record_n(v, count_at_each_addend_value).unwrap(); } addends.push(h); diff --git a/benches/serialization.rs b/benches/serialization.rs index 7763638..6c89fe1 100644 --- a/benches/serialization.rs +++ b/benches/serialization.rs @@ -6,11 +6,14 @@ extern crate test; use hdrsample::*; use hdrsample::serialization::*; -use self::rand::distributions::range::Range; -use self::rand::distributions::IndependentSample; use self::test::Bencher; use std::io::Cursor; +use self::rand_varint::*; + +#[path = "../src/serialization/rand_varint.rs"] +mod rand_varint; + #[bench] fn serialize_tiny_dense_v2(b: &mut Bencher) { // 256 + 3 * 128 = 640 counts @@ -173,11 +176,12 @@ fn do_serialize_bench( let random_counts = (fraction_of_counts_len * h.distinct_values() as f64) as usize; let mut vec = Vec::with_capacity(random_counts); - let range = Range::new(low, high); - let mut rng = rand::weak_rng(); - for _ in 0..random_counts { - h.record(range.ind_sample(&mut rng)).unwrap(); + for v in RandomVarintEncodedLengthIter::new(&mut rng) + .filter(|v| v >= &low && v <= &high) + .take(random_counts) + { + h.record(v).unwrap(); } b.iter(|| { @@ -201,11 +205,12 @@ fn do_deserialize_bench( let random_counts = (fraction_of_counts_len * h.distinct_values() as f64) as usize; let mut vec = Vec::with_capacity(random_counts); - let range = Range::new(low, high); - let mut rng = rand::weak_rng(); - for _ in 0..random_counts { - h.record(range.ind_sample(&mut rng)).unwrap(); + for v in RandomVarintEncodedLengthIter::new(&mut rng) + .filter(|v| v >= &low && v <= &high) + .take(random_counts) + { + h.record(v).unwrap(); } let _ = s.serialize(&h, &mut vec).unwrap(); diff --git a/src/serialization/rand_varint.rs b/src/serialization/rand_varint.rs new file mode 100644 index 0000000..9e751c9 --- /dev/null +++ b/src/serialization/rand_varint.rs @@ -0,0 +1,126 @@ +extern crate rand; + +use self::rand::Rng; +use self::rand::distributions::range::Range; +use self::rand::distributions::IndependentSample; + +/// Smallest number in our varint encoding that takes the given number of bytes +pub fn smallest_number_in_n_byte_varint(byte_length: usize) -> u64 { + assert!(byte_length <= 9 && byte_length >= 1); + + match byte_length { + 1 => 0, + // one greater than the largest of the previous length + _ => largest_number_in_n_byte_varint(byte_length - 1) + 1, + } +} + +/// Largest number in our varint encoding that takes the given number of bytes +pub fn largest_number_in_n_byte_varint(byte_length: usize) -> u64 { + assert!(byte_length <= 9 && byte_length >= 1); + + match byte_length { + 9 => 
u64::max_value(), + _ => largest_number_in_7_bit_chunk(byte_length - 1), + } +} + +/// The largest in the set of numbers that have at least 1 bit set in the n'th chunk of 7 bits. +fn largest_number_in_7_bit_chunk(chunk_index: usize) -> u64 { + // Our 9-byte varints do different encoding in the last byte, so we don't handle them here + assert!(chunk_index <= 7); + + // 1 in every bit below the lowest bit in this chunk + let lower_bits = match chunk_index { + 0 => 0, + _ => largest_number_in_7_bit_chunk(chunk_index - 1), + }; + + // 1 in every bit in this chunk + let this_chunk = 0x7F_u64 << (chunk_index * 7); + + lower_bits | this_chunk +} + +// Evenly distributed random numbers end up biased heavily towards longer encoded byte lengths: +// there are a lot more large numbers than there are small (duh), but for exercising serialization +// code paths, we'd like many at all byte lengths. This is also arguably more representative of +// real data. This should emit values whose varint lengths are uniformly distributed across the +// whole length range (1 to 9). +pub struct RandomVarintEncodedLengthIter { + ranges: [Range; 9], + range_for_picking_range: Range, + rng: R, +} + +impl RandomVarintEncodedLengthIter { + pub fn new(rng: R) -> RandomVarintEncodedLengthIter { + RandomVarintEncodedLengthIter { + ranges: [ + Range::new( + smallest_number_in_n_byte_varint(1), + largest_number_in_n_byte_varint(1) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(2), + largest_number_in_n_byte_varint(2) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(3), + largest_number_in_n_byte_varint(3) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(4), + largest_number_in_n_byte_varint(4) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(5), + largest_number_in_n_byte_varint(5) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(6), + largest_number_in_n_byte_varint(6) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(7), + largest_number_in_n_byte_varint(7) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(8), + largest_number_in_n_byte_varint(8) + 1, + ), + Range::new( + smallest_number_in_n_byte_varint(9), + largest_number_in_n_byte_varint(9), + ), + ], + range_for_picking_range: Range::new(0, 9), + rng, + } + } +} + +impl Iterator for RandomVarintEncodedLengthIter { + type Item = u64; + + fn next(&mut self) -> Option { + // pick the range we'll use + let value_range = self.ranges[self.range_for_picking_range.ind_sample(&mut self.rng)]; + + Some(value_range.ind_sample(&mut self.rng)) + } +} + +#[test] +fn largest_number_in_7_bit_chunk_correct() { + // 8 chunks (indices 0-7) of 7 bits gets you to 56 bits. Last byte in varint is handled + // differently, so we don't test that here. 
+ for i in 0..8 { + let largest = largest_number_in_7_bit_chunk(i); + assert_eq!((i as u32 + 1) * 7, largest.count_ones()); + + assert_eq!(64 - ((i as u32) + 1) * 7, largest.leading_zeros()); + // any larger and it will be in the next chunk + assert_eq!(largest.leading_zeros() - 1, (largest + 1).leading_zeros()); + } +} diff --git a/src/serialization/tests.rs b/src/serialization/tests.rs index 8298d27..a314d71 100644 --- a/src/serialization/tests.rs +++ b/src/serialization/tests.rs @@ -16,6 +16,11 @@ use self::rand::{Rand, Rng}; use self::rand::distributions::range::{Range, SampleRange}; use self::rand::distributions::IndependentSample; +use self::rand_varint::*; + +#[path = "rand_varint.rs"] +mod rand_varint; + #[test] fn serialize_all_zeros() { let h = histo64(1, 2047, 3); @@ -479,22 +484,6 @@ fn zig_zag_roundtrip_random() { } } -// Test that varint test helpers are correct - -#[test] -fn largest_number_in_7_bit_chunk_correct() { - // 8 chunks (indices 0-7) of 7 bits gets you to 56 bits. Last byte in varint is handled - // differently, so we don't test that here. - for i in 0..8 { - let largest = largest_number_in_7_bit_chunk(i); - assert_eq!((i as u32 + 1) * 7, largest.count_ones()); - - assert_eq!(64 - ((i as u32) + 1) * 7, largest.leading_zeros()); - // any larger and it will be in the next chunk - assert_eq!(largest.leading_zeros() - 1, (largest + 1).leading_zeros()); - } -} - fn do_varint_write_read_roundtrip_rand(byte_length: usize) { assert!(byte_length <= 9 && byte_length >= 1); @@ -635,45 +624,6 @@ fn assert_deserialized_histogram_matches_orig( ); } -/// Smallest number in our varint encoding that takes the given number of bytes -fn smallest_number_in_n_byte_varint(byte_length: usize) -> u64 { - assert!(byte_length <= 9 && byte_length >= 1); - - match byte_length { - 1 => 0, - // one greater than the largest of the previous length - _ => largest_number_in_n_byte_varint(byte_length - 1) + 1, - } -} - -/// Largest number in our varint encoding that takes the given number of bytes -fn largest_number_in_n_byte_varint(byte_length: usize) -> u64 { - assert!(byte_length <= 9 && byte_length >= 1); - - match byte_length { - 9 => u64::max_value(), - _ => largest_number_in_7_bit_chunk(byte_length - 1), - } -} - -/// The largest in the set of numbers that have at least 1 bit set in the n'th chunk of 7 bits. -fn largest_number_in_7_bit_chunk(chunk_index: usize) -> u64 { - // Our 9-byte varints do different encoding in the last byte, so we don't handle them here - assert!(chunk_index <= 7); - - // 1 in every bit below the lowest bit in this chunk - let lower_bits = match chunk_index { - 0 => 0, - _ => largest_number_in_7_bit_chunk(chunk_index - 1), - }; - - // 1 in every bit in this chunk - let this_chunk = 0x7F_u64 << (chunk_index * 7); - - lower_bits | this_chunk -} - - struct RandomRangeIter { range: Range, rng: R, @@ -681,10 +631,7 @@ struct RandomRangeIter { impl RandomRangeIter { fn new(rng: R, range: Range) -> RandomRangeIter { - RandomRangeIter { - rng: rng, - range: range, - } + RandomRangeIter { rng, range } } } @@ -695,72 +642,3 @@ impl Iterator for RandomRangeIter { Some(self.range.ind_sample(&mut self.rng)) } } - -// Evenly distributed random numbers end up biased heavily towards longer encoded byte lengths: -// there are a lot more large numbers than there are small (duh), but for exercising serialization -// code paths, we'd like many at all byte lengths. This is also arguably more representative of -// real data. 
This should emit values whose varint lengths are uniformly distributed across the -// whole length range (1 to 9). -struct RandomVarintEncodedLengthIter { - ranges: [Range; 9], - range_for_picking_range: Range, - rng: R, -} - -impl RandomVarintEncodedLengthIter { - fn new(rng: R) -> RandomVarintEncodedLengthIter { - RandomVarintEncodedLengthIter { - ranges: [ - Range::new( - smallest_number_in_n_byte_varint(1), - largest_number_in_n_byte_varint(1) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(2), - largest_number_in_n_byte_varint(2) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(3), - largest_number_in_n_byte_varint(3) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(4), - largest_number_in_n_byte_varint(4) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(5), - largest_number_in_n_byte_varint(5) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(6), - largest_number_in_n_byte_varint(6) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(7), - largest_number_in_n_byte_varint(7) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(8), - largest_number_in_n_byte_varint(8) + 1, - ), - Range::new( - smallest_number_in_n_byte_varint(9), - largest_number_in_n_byte_varint(9), - ), - ], - range_for_picking_range: Range::new(0, 9), - rng: rng, - } - } -} - -impl Iterator for RandomVarintEncodedLengthIter { - type Item = u64; - - fn next(&mut self) -> Option { - // pick the range we'll use - let value_range = self.ranges[self.range_for_picking_range.ind_sample(&mut self.rng)]; - - Some(value_range.ind_sample(&mut self.rng)) - } -} From 5553e18b9ad2c46a8f54969f127a99559d672053 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 27 Oct 2017 10:50:04 -0500 Subject: [PATCH 07/18] Add interval log benchmark. It's fast enough that I'm not stressing about optimizing it further. 
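As a rough guide, a benchmark like this can typically be run with something along the lines of `cargo +nightly bench --bench interval_log` (nightly because the bench harness relies on the unstable `test` feature); the exact invocation may vary with your setup.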
--- benches/interval_log.rs | 78 +++++++++++++++++++++++++++ src/serialization/interval_log/mod.rs | 12 +++-- 2 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 benches/interval_log.rs diff --git a/benches/interval_log.rs b/benches/interval_log.rs new file mode 100644 index 0000000..77b7ca7 --- /dev/null +++ b/benches/interval_log.rs @@ -0,0 +1,78 @@ +#![feature(test)] + +extern crate hdrsample; +extern crate rand; +extern crate test; + +use hdrsample::*; +use hdrsample::serialization; +use hdrsample::serialization::interval_log; +use test::Bencher; + +use self::rand_varint::*; + +#[path = "../src/serialization/rand_varint.rs"] +mod rand_varint; + +#[bench] +fn write_interval_log_1k_hist_10k_value(b: &mut Bencher) { + let mut log = Vec::new(); + let mut histograms = Vec::new(); + let mut rng = rand::weak_rng(); + + for _ in 0..1000 { + let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(10_000) { + h.record(v).unwrap(); + } + + histograms.push(h); + } + + let mut serializer = serialization::V2Serializer::new(); + + b.iter(|| { + log.clear(); + + let mut writer = + interval_log::IntervalLogHeaderWriter::new(&mut log, &mut serializer).into_log_writer(); + + for h in histograms.iter() { + writer.write_histogram(h, 1.234, 5.678, None, 1.0).unwrap(); + } + }) +} + +#[bench] +fn parse_interval_log_1k_hist_10k_value(b: &mut Bencher) { + let mut log = Vec::new(); + let mut histograms = Vec::new(); + let mut rng = rand::weak_rng(); + + for _ in 0..1000 { + let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + + for v in RandomVarintEncodedLengthIter::new(&mut rng).take(10_000) { + h.record(v).unwrap(); + } + + histograms.push(h); + } + + { + let mut serializer = serialization::V2Serializer::new(); + let mut writer = + interval_log::IntervalLogHeaderWriter::new(&mut log, &mut serializer).into_log_writer(); + + for h in histograms.iter() { + writer.write_histogram(h, 1.234, 5.678, None, 1.0).unwrap(); + } + } + + b.iter(|| { + let iter = interval_log::IntervalLogIterator::new(&log); + + assert_eq!(1000, iter.count()); + }) +} diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 1ee4788..096fa51 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -234,10 +234,6 @@ impl<'a> IntervalLogHistogram<'a> { /// /// If you need the deserialized histogram, base64-decode and use a `Deserializer` on the /// resulting bytes. - /// - /// Histograms are left in their original encoding to make parsing each log entry very cheap. - /// One usage pattern is to navigate to a certain point in the log and only deserialize a few - /// interesting histograms, so it would be inefficient to deserialize them at log parse time. pub fn encoded_histogram(&self) -> &'a str { self.encoded_histogram } @@ -275,6 +271,14 @@ pub enum LogIteratorError { /// This iterator exposes each item (excluding comments and other information-free lines). See /// `LogEntry`. /// +/// Because histogram deserialization is deferred, parsing logs is fast. (See the `interval_log` +/// benchmark if you wish to see how it does on your hardware. As a baseline, parsing a log of 1000 +/// random histograms of 10,000 values each takes 8ms total on an E5-1650v3.) +/// +/// Deferring deserialization is handy because it allows you to cheaply navigate the log to find +/// the records you care about (e.g. 
ones in a certain time range, or with a certain tag) without +/// doing all the allocation, etc, of deserialization. +/// /// This parses from a slice representing the complete file because it made implementation easier /// (and also supports mmap'd files for maximum parsing speed). If parsing from a `Read` is /// important for your use case, open an issue about it. From 00a5ee1602775ad0fbee5bfe3159b69bd1ea880f Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Sat, 28 Oct 2017 22:32:24 -0500 Subject: [PATCH 08/18] Add docs explaining BaseTime and StartTime --- src/serialization/interval_log/mod.rs | 255 +++++++++++++++++++----- src/serialization/interval_log/tests.rs | 51 ++++- 2 files changed, 248 insertions(+), 58 deletions(-) diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 096fa51..e42be0d 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -7,13 +7,161 @@ //! see in logs, etc. //! //! An interval log contains some initial metadata, then a sequence of histograms, each with some -//! additional metadata (timestamps, etc). +//! additional metadata (timestamps, etc). See `IntervalLogHistogram`. +//! +//! The intervals in the log should be ordered by start timestamp. It's possible to write (and +//! parse) logs with intervals in any order, but the expectation is that they will be sorted. //! //! To parse a log, see `IntervalLogIterator`. To write a log, see `IntervalLogHeaderWriter`. +//! +//! # Timestamps +//! +//! Each interval has a timestamp in seconds associated with it. However, it's not necessarily as +//! simple as just interpreting the number as seconds since the epoch. +//! +//! There are two optional pieces of header metadata: "StartTime" and "BaseTime". Neither, one, or +//! both of these may be present. It is possible to have multiple StartTime or BaseTime entries in +//! the log, but that is discouraged as it is confusing to interpret. It is also possible to have +//! StartTime and BaseTime interleaved with interval histograms, but that is even more confusing, so +//! this API prevents you from doing so. +//! +//! ### Timestamp options +//! +//! This is a summary of the logic used by the Java impl's `HistogramLogReader` for StartTime and +//! BaseTime. +//! +//! - Neither are present: interval timestamps should be interpreted as seconds since the epoch. +//! - StartTime is present: StartTime is a number of seconds since epoch, and interval timestamps +//! should be interpreted as deltas that could be added to StartTime if seconds since epoch for each +//! interval is needed. +//! - BaseTime is present: same as the case where StartTime is present. It's seconds since epoch, +//! with interval timestamps as deltas. +//! - BaseTime and StartTime are present: The BaseTime is used like it is when it's the only one +//! present: it's a number of seconds since epoch that serves as the starting point for the +//! per-interval deltas to get a wall-clock time for each interval. The StartTime is a *different* +//! number of seconds since epoch whose meaning is really up to the user. One hypothetical use might +//! be if you're performing a long-running benchmark and outputting a new interval log every hour. +//! The BaseTime of each log would be the seconds since epoch at the creation time of that log file, +//! but the StartTime would be the same for each file: the time that the benchmark started. Thus, +//! 
if you wanted to find the interval histogram for 4000 seconds into the benchmark, you would load +//! the second hour's file, add each interval's timestamp to that log's BaseTime, and select the one +//! whose (timestmap + BaseTime) was 4000 bigger than the StartTime. This seems to be how the Java +//! impl uses it: `HistogramLogReader` lets you filter by "non-absolute" start/end time or by +//! "absolute" start/end time. The former uses a range of deltas from StartTime and selects +//! intervals where `interval_timestamp + base_time - start_time` is in the requested range, while +//! the latter uses a range of absolute timestamps and selects via `interval_timestamp + base_time`. +//! +//! ### Timestamp recommendations +//! +//! As you can see from that slab of text, using both BaseTime and StartTime is complex. +//! +//! We suggest one of the following: +//! +//! - Don't use a timestamp header, and simply have each interval's timestamp be the seconds since +//! epoch. +//! - Use StartTime, and have each interval's timestamp be a delta from StartTime. +//! +//! Of course, if you are writing logs that need to work with an existing log processing pipeline, +//! you should use timestamps as expected by that logic, so we provide the ability to have all +//! combinations of timestamp headers if need be. +//! +//! # Examples +//! +//! Parse a single interval from a log. +//! +//! ``` +//! use hdrsample::serialization::interval_log; +//! +//! // two newline-separated log lines: a comment, then an interval +//! let log = b"#I'm a comment\nTag=t,0.127,1.007,2.769,base64EncodedHisto\n"; +//! +//! let mut iter = interval_log::IntervalLogIterator::new(&log[..]); +//! +//! // the comment is consumed and ignored by the parser, so the first event is an Interval +//! match iter.next().unwrap().unwrap() { +//! interval_log::LogEntry::Interval(h) => { +//! assert_eq!(0.127, h.start_timestamp()); +//! } +//! _ => panic!() +//! } +//! +//! // there are no more lines in the log; iteration complete +//! assert_eq!(None, iter.next()); +//! ``` +//! +//! Skip logs that started before 3 seconds. +//! +//! ``` +//! use hdrsample::serialization::interval_log; +//! +//! let mut log = Vec::new(); +//! log.extend_from_slice(b"#I'm a comment\n"); +//! log.extend_from_slice(b"Tag=a,0.123,1.007,2.769,base64EncodedHisto\n"); +//! log.extend_from_slice(b"1.456,1.007,2.769,base64EncodedHisto\n"); +//! log.extend_from_slice(b"3.789,1.007,2.769,base64EncodedHisto\n"); +//! log.extend_from_slice(b"Tag=b,4.123,1.007,2.769,base64EncodedHisto\n"); +//! log.extend_from_slice(b"5.456,1.007,2.769,base64EncodedHisto\n"); +//! log.extend_from_slice(b"#Another comment\n"); +//! +//! let iter = interval_log::IntervalLogIterator::new(&log); +//! +//! let count = iter.map(|r| r.unwrap()) +//! // only look at intervals (which are the only non-comment lines in this log) +//! .filter_map(|e| match e { +//! interval_log::LogEntry::Interval(ilh) => Some(ilh), +//! _ => None +//! }) +//! // do any filtering you want +//! .filter(|ilh| ilh.start_timestamp() >= 3.0) +//! .count(); +//! +//! assert_eq!(3, count); +//! ``` +//! +//! Write a log. +//! +//! ``` +//! use std::str; +//! use hdrsample; +//! use hdrsample::serialization; +//! use hdrsample::serialization::interval_log; +//! +//! let mut buf = Vec::new(); +//! let mut serializer = serialization::V2Serializer::new(); +//! +//! let mut h = hdrsample::Histogram::::new_with_bounds( +//! 1, u64::max_value(), 3).unwrap(); +//! h.record(12345).unwrap(); +//! +//! 
// limit scope of mutable borrow of `buf` +//! { +//! let mut header_writer = interval_log::IntervalLogHeaderWriter::new( +//! &mut buf, &mut serializer); +//! header_writer.write_comment("Comments are great").unwrap(); +//! header_writer.write_start_time(123.456789).unwrap(); +//! +//! let mut log_writer = header_writer.into_log_writer(); +//! +//! log_writer.write_comment( +//! "You can have comments anywhere in the log").unwrap(); +//! +//! log_writer +//! .write_histogram( +//! &h, +//! 1.234, +//! 5.678, +//! interval_log::Tag::new("im-a-tag"), +//! 1.0) +//! .unwrap(); +//! } +//! +//! // `buf` is now full of stuff; we check for the first line +//! assert_eq!("#Comments are great\n", &str::from_utf8(&buf).unwrap()[0..20]); +//! ``` extern crate base64; -use std::{io, ops, str}; +use std::{fmt, io, ops, str}; use std::fmt::Write; use nom::{double, line_ending, not_line_ending, IResult}; @@ -21,7 +169,7 @@ use nom::{double, line_ending, not_line_ending, IResult}; use super::super::{Counter, Histogram}; use super::Serializer; -/// Start writing an interval log. +/// Write headers for an interval log. /// /// This type only allows writing comments and headers. Once you're done writing those things, use /// `into_log_writer()` to convert this into an `IntervalLogWriter`. @@ -43,10 +191,33 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogHeaderWriter<'a, } /// Add a comment line. + /// + /// If you do silly things like write a comment with a newline character, you'll end up with + /// an un-parseable log file. Don't do that. pub fn write_comment(&mut self, s: &str) -> io::Result<()> { self.internal_writer.write_comment(s) } + /// Write a StartTime log line. See the module-level documentation for more info. + /// + /// This should only be called once to avoid creating a confusing interval log. + pub fn write_start_time(&mut self, seconds_since_epoch: f64) -> io::Result<()> { + self.internal_writer.write_fmt(format_args!( + "#[StartTime: {:.3} (seconds since epoch)]\n", + seconds_since_epoch + )) + } + + /// Write a BaseTime log line. See the module-level documentation for more info. + /// + /// This should only be called once to avoid creating a confusing interval log. + pub fn write_base_time(&mut self, seconds_since_epoch: f64) -> io::Result<()> { + self.internal_writer.write_fmt(format_args!( + "#[BaseTime: {:.3} (seconds since epoch)]\n", + seconds_since_epoch + )) + } + /// Once you're finished with headers, convert this into a log writer so you can write interval /// histograms. pub fn into_log_writer(self) -> IntervalLogWriter<'a, 'b, W, S> { @@ -58,7 +229,9 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogHeaderWriter<'a, /// Writes interval histograms in an interval log. /// -/// This isn't created directly; start with an `IntervalLogHeaderWriter`. +/// This isn't created directly; start with an `IntervalLogHeaderWriter`. Once you've written the +/// headers and ended up with an `IntervalLogWriter`, typical usage would be to write a histogram +/// at regular intervals (e.g. once a second). pub struct IntervalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { internal_writer: InternalLogWriter<'a, 'b, W, S>, } @@ -72,12 +245,16 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, /// Write an interval histogram. /// /// `start_timestamp` is the time since the epoch in seconds. If you're using a StartTime or - /// BaseTime offset, you should instead use a delta since that time. 
+ /// BaseTime offset, you should instead use a delta since that time. See the discussion about + /// timestamps in the module-level documentation. + /// /// `duration` is the duration of the interval in seconds. + /// /// `tag` is an optional tag for this histogram. + /// /// `max_value_divisor` is used to scale down the max value to something that may be more human /// readable. The max value in the log is only for human consumption, so you might prefer to - /// divide by 10^9 to turn nanoseconds into fractional seconds, for instance. + /// divide by 109 to turn nanoseconds into fractional seconds, for instance. pub fn write_histogram( &mut self, h: &Histogram, @@ -115,6 +292,10 @@ struct InternalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { } impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, S> { + fn write_fmt(&mut self, args: fmt::Arguments) -> io::Result<()> { + self.writer.write_fmt(args) + } + fn write_comment(&mut self, s: &str) -> io::Result<()> { write!(self.writer, "#{}\n", s) } @@ -212,7 +393,8 @@ impl<'a> IntervalLogHistogram<'a> { /// Timestamp of the start of the interval in seconds. /// /// The timestamp may be absolute vs the epoch, or there may be a `StartTime` or `BaseTime` for - /// the log, in which case you may wish to consider this number as a delta vs those timestamps.. + /// the log, in which case you may wish to consider this number as a delta vs those timestamps. + /// See the module-level documentation about timestamps. pub fn start_timestamp(&self) -> f64 { self.start_timestamp } @@ -279,56 +461,21 @@ pub enum LogIteratorError { /// the records you care about (e.g. ones in a certain time range, or with a certain tag) without /// doing all the allocation, etc, of deserialization. /// +/// If you're looking for a direct port of the Java impl's `HistogramLogReader`, this isn't one: it +/// won't deserialize for you, and it pushes the burden of figuring out what to do with StartTime, +/// BaseTime, etc to you, and there aren't built in functions to filter by timestamp. On the other +/// hand, because it doesn't do those things, it is much more flexible: you can easily build any +/// sort of filtering you want, not just timestamp ranges, because you have cheap access to all the +/// metadata before incurring the cost of deserialization. If you're not using any timestamp +/// headers, or at least using them in straightforward ways, it is easy to accumulate the +/// timestamp state you need. Since all the parsing is taken care of already, writing your own +/// `HistogramLogReader` equivalent that fits the way your logs are assembled is just a couple of +/// lines. (And if you're doing complex stuff, we probably wouldn't have built something that fits +/// your quirky logs anyway!) +/// /// This parses from a slice representing the complete file because it made implementation easier /// (and also supports mmap'd files for maximum parsing speed). If parsing from a `Read` is /// important for your use case, open an issue about it. -/// -/// # Examples -/// -/// Parse a single interval from a log. 
-/// -/// ``` -/// use hdrsample::serialization::interval_log; -/// // two newline-separated log lines: a comment, then an interval -/// let log = b"#I'm a comment\nTag=t,0.127,1.007,2.769,base64EncodedHisto\n"; -/// -/// let mut iter = interval_log::IntervalLogIterator::new(&log[..]); -/// -/// match iter.next().unwrap().unwrap() { -/// interval_log::LogEntry::Interval(h) => { -/// assert_eq!(0.127, h.start_timestamp()); -/// } -/// _ => panic!() -/// } -/// -/// assert_eq!(None, iter.next()); -/// ``` -/// -/// Skip logs that started before 3 seconds. -/// -/// ``` -/// use hdrsample::serialization::interval_log; -/// let mut log = Vec::new(); -/// log.extend_from_slice(b"#I'm a comment\n"); -/// log.extend_from_slice(b"Tag=a,0.123,1.007,2.769,base64EncodedHisto\n"); -/// log.extend_from_slice(b"1.456,1.007,2.769,base64EncodedHisto\n"); -/// log.extend_from_slice(b"3.789,1.007,2.769,base64EncodedHisto\n"); -/// log.extend_from_slice(b"Tag=b,4.123,1.007,2.769,base64EncodedHisto\n"); -/// log.extend_from_slice(b"5.456,1.007,2.769,base64EncodedHisto\n"); -/// log.extend_from_slice(b"#Another comment\n"); -/// -/// let iter = interval_log::IntervalLogIterator::new(&log); -/// -/// let count = iter.map(|r| r.unwrap()) -/// .filter_map(|e| match e { -/// interval_log::LogEntry::Interval(ilh) => Some(ilh), -/// _ => None -/// }) -/// .filter(|ilh| ilh.start_timestamp() >= 3.0) -/// .count(); -/// -/// assert_eq!(3, count); -/// ``` pub struct IntervalLogIterator<'a> { orig_len: usize, input: &'a [u8], diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index 6d2be6c..e6ccf60 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -77,7 +77,39 @@ fn write_interval_histo_with_tag() { } #[test] -fn parse_start_time() { +fn write_start_time() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + { + let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); + header_writer.write_start_time(123.456789).unwrap(); + } + + assert_eq!( + "#[StartTime: 123.457 (seconds since epoch)]\n", + str::from_utf8(&buf[..]).unwrap() + ); +} + +#[test] +fn write_base_time() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + { + let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); + header_writer.write_base_time(123.456789).unwrap(); + } + + assert_eq!( + "#[BaseTime: 123.457 (seconds since epoch)]\n", + str::from_utf8(&buf[..]).unwrap() + ); +} + +#[test] +fn parse_start_time_with_human_date() { let (rest, e) = start_time( b"#[StartTime: 1441812279.474 (seconds since epoch), Wed Sep 09 08:24:39 PDT 2015]\nfoo", ).unwrap(); @@ -88,11 +120,22 @@ fn parse_start_time() { assert_eq!(b"foo", rest); } +#[test] +fn parse_start_time_without_human_date() { + // Can't be bothered to format a timestamp for humans, so we don't write that data. It's just + // another part that could be wrong -- what if it disagrees with the seconds since epoch? + // Also, BaseTime doesn't have a human-formatted time. 
+ let (rest, e) = start_time(b"#[StartTime: 1441812279.474 (seconds since epoch)]\nfoo").unwrap(); + + let expected = LogEntry::StartTime(1441812279.474); + + assert_eq!(expected, e); + assert_eq!(b"foo", rest); +} + #[test] fn parse_base_time() { - let (rest, e) = base_time( - b"#[BaseTime: 1441812279.474 (seconds since epoch), Wed Sep 09 08:24:39 PDT 2015]\nfoo", - ).unwrap(); + let (rest, e) = base_time(b"#[BaseTime: 1441812279.474 (seconds since epoch)]\nfoo").unwrap(); let expected = LogEntry::BaseTime(1441812279.474); From 686e350d3c77d950d074d39ff5f035c52e2d1c87 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Tue, 31 Oct 2017 10:20:04 -0500 Subject: [PATCH 09/18] Bring back compiler warnings --- src/lib.rs | 2 +- src/serialization/interval_log/mod.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index b35b5c5..6f66f54 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -182,7 +182,7 @@ //! ergonomic as possible. #![deny(missing_docs, trivial_casts, trivial_numeric_casts, unused_extern_crates, - unused_import_braces, unused_results)] + unused_import_braces, unused_results, variant_size_differences, warnings)] // Enable feature(test) is enabled so that we can have benchmarks of private code #![cfg_attr(all(test, feature = "bench_private"), feature(test))] diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index e42be0d..892fe08 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -429,6 +429,7 @@ impl<'a> IntervalLogHistogram<'a> { /// will be represented by the iterator providing the corresponding variants here. The presence /// of those timestamps will affect how you should interpret the timestamps for individual /// intervals. +#[allow(variant_size_differences)] pub enum LogEntry<'a> { /// Logs may include a StartTime. If present, it represents seconds since the epoch. StartTime(f64), From ed8cda68ec3595f1a816490b431404a4a7664ab2 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Tue, 31 Oct 2017 21:43:48 -0500 Subject: [PATCH 10/18] Fix a few doc quirks and other minor cleanup --- src/lib.rs | 3 --- src/serialization/interval_log/mod.rs | 21 ++++++++++----------- src/serialization/interval_log/tests.rs | 7 ++++--- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6f66f54..9ab2df3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1751,9 +1751,6 @@ where // TODO: shift // TODO: hash -// TODO: encoding/decoding -// TODO: timestamps and tags -// TODO: textual output #[path = "tests/tests.rs"] #[cfg(test)] diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 892fe08..e9b33af 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -244,9 +244,10 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, /// Write an interval histogram. /// - /// `start_timestamp` is the time since the epoch in seconds. If you're using a StartTime or - /// BaseTime offset, you should instead use a delta since that time. See the discussion about - /// timestamps in the module-level documentation. + /// `start_timestamp` is the time since the epoch in seconds that measurements started being + /// recorded in this interval. If you're using a StartTime or BaseTime offset, you should + /// instead use a delta since that time. See the discussion about timestamps in the module-level + /// documentation. 
/// /// `duration` is the duration of the interval in seconds. /// @@ -428,7 +429,7 @@ impl<'a> IntervalLogHistogram<'a> { /// written with a StartTime or BaseTime, that metadata will appear in header comments, and that /// will be represented by the iterator providing the corresponding variants here. The presence /// of those timestamps will affect how you should interpret the timestamps for individual -/// intervals. +/// intervals. See the module-level documentation. #[allow(variant_size_differences)] pub enum LogEntry<'a> { /// Logs may include a StartTime. If present, it represents seconds since the epoch. @@ -454,9 +455,9 @@ pub enum LogIteratorError { /// This iterator exposes each item (excluding comments and other information-free lines). See /// `LogEntry`. /// -/// Because histogram deserialization is deferred, parsing logs is fast. (See the `interval_log` +/// Because histogram deserialization is deferred, parsing logs is fast. See the `interval_log` /// benchmark if you wish to see how it does on your hardware. As a baseline, parsing a log of 1000 -/// random histograms of 10,000 values each takes 8ms total on an E5-1650v3.) +/// random histograms of 10,000 values each takes 8ms total on an E5-1650v3. /// /// Deferring deserialization is handy because it allows you to cheaply navigate the log to find /// the records you care about (e.g. ones in a certain time range, or with a certain tag) without @@ -496,22 +497,20 @@ impl<'a> Iterator for IntervalLogIterator<'a> { type Item = Result, LogIteratorError>; fn next(&mut self) -> Option { - // Look for magic comments first otherwise they will get matched by the simple comment - // parser loop { if self.input.is_empty() { return None; } + // Look for magic comments first otherwise they will get matched by the simple comment + // parser if let IResult::Done(rest, e) = log_entry(self.input) { self.input = rest; return Some(Ok(e)); } // it wasn't a log entry; try parsing a comment - - let ignored_line_result = ignored_line(self.input); - match ignored_line_result { + match ignored_line(self.input) { IResult::Done(rest, _) => { self.input = rest; continue; diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index e6ccf60..fead108 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -37,19 +37,20 @@ fn write_interval_histo_no_tag() { let mut buf = Vec::new(); let mut serializer = V2Serializer::new(); - let h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + let mut h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); + h.record(1000).unwrap(); { let header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); let mut log_writer = header_writer.into_log_writer(); log_writer - .write_histogram(&h, 1.2345678, 5.67, None, 1.0) + .write_histogram(&h, 1.2345678, 5.67, None, 10.0) .unwrap(); } assert_eq!( - "1.235,5.670,0.000,HISTEwAAAAEAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAAA=\n", + "1.235,5.670,100.000,HISTEwAAAAMAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAM8PAg==\n", str::from_utf8(&buf[..]).unwrap() ); } From 94574f359bd5d223b6ee59211797772e5d0071fe Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Thu, 2 Nov 2017 17:49:19 -0500 Subject: [PATCH 11/18] Address PR feedback, part 1. 
- Doc test tidying - Use `time::Duration` for interval duration --- src/serialization/interval_log/mod.rs | 47 +++++++++++++++---------- src/serialization/interval_log/tests.rs | 18 ++++++++-- src/serialization/rand_varint.rs | 3 ++ tests/interval_log.rs | 40 +++++++++++++++++---- 4 files changed, 80 insertions(+), 28 deletions(-) diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index e9b33af..46b04c2 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -94,21 +94,22 @@ //! ``` //! use hdrsample::serialization::interval_log; //! -//! let mut log = Vec::new(); -//! log.extend_from_slice(b"#I'm a comment\n"); -//! log.extend_from_slice(b"Tag=a,0.123,1.007,2.769,base64EncodedHisto\n"); -//! log.extend_from_slice(b"1.456,1.007,2.769,base64EncodedHisto\n"); -//! log.extend_from_slice(b"3.789,1.007,2.769,base64EncodedHisto\n"); -//! log.extend_from_slice(b"Tag=b,4.123,1.007,2.769,base64EncodedHisto\n"); -//! log.extend_from_slice(b"5.456,1.007,2.769,base64EncodedHisto\n"); -//! log.extend_from_slice(b"#Another comment\n"); +//! let mut log = "\ +//! #I'm a comment\n\ +//! Tag=a,0.123,1.007,2.769,base64EncodedHisto\n\ +//! 1.456,1.007,2.769,base64EncodedHisto\n\ +//! 3.789,1.007,2.769,base64EncodedHisto\n\ +//! Tag=b,4.123,1.007,2.769,base64EncodedHisto\n\ +//! 5.456,1.007,2.769,base64EncodedHisto\n\ +//! #Another comment\n" +//! .as_bytes(); //! //! let iter = interval_log::IntervalLogIterator::new(&log); //! -//! let count = iter.map(|r| r.unwrap()) +//! let count = iter //! // only look at intervals (which are the only non-comment lines in this log) //! .filter_map(|e| match e { -//! interval_log::LogEntry::Interval(ilh) => Some(ilh), +//! Ok(interval_log::LogEntry::Interval(ilh)) => Some(ilh), //! _ => None //! }) //! // do any filtering you want @@ -121,7 +122,7 @@ //! Write a log. //! //! ``` -//! use std::str; +//! use std::{str, time}; //! use hdrsample; //! use hdrsample::serialization; //! use hdrsample::serialization::interval_log; @@ -149,7 +150,7 @@ //! .write_histogram( //! &h, //! 1.234, -//! 5.678, +//! time::Duration::new(12, 345_678_901), //! interval_log::Tag::new("im-a-tag"), //! 1.0) //! .unwrap(); @@ -161,7 +162,7 @@ extern crate base64; -use std::{fmt, io, ops, str}; +use std::{fmt, io, ops, str, time}; use std::fmt::Write; use nom::{double, line_ending, not_line_ending, IResult}; @@ -260,7 +261,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, &mut self, h: &Histogram, start_timestamp: f64, - duration: f64, + duration: time::Duration, tag: Option, max_value_divisor: f64, ) -> Result<(), IntervalLogWriterError> { @@ -305,7 +306,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, &mut self, h: &Histogram, start_timestamp: f64, - duration: f64, + duration: time::Duration, tag: Option, max_value_divisor: f64, ) -> Result<(), IntervalLogWriterError> { @@ -321,7 +322,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, "{}{:.3},{:.3},{:.3},", self.text_buf, start_timestamp, - duration, + duration_as_fp_seconds(duration), h.max() as f64 / max_value_divisor // because the Java impl does it this way )?; @@ -345,7 +346,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, /// To get the wrapped `str` back out, use `as_str()` or the `Deref` implementation /// (`&some_tag`). 
#[derive(Debug, PartialEq, Clone, Copy)] -pub struct Tag<'a>(pub &'a str); +pub struct Tag<'a>(&'a str); impl<'a> Tag<'a> { /// Create a new Tag. @@ -380,6 +381,7 @@ impl<'a> ops::Deref for Tag<'a> { pub struct IntervalLogHistogram<'a> { tag: Option>, start_timestamp: f64, + // lazily map to Duration to save parsing time and a few bytes of space on ILH duration: f64, max: f64, encoded_histogram: &'a str, @@ -401,8 +403,11 @@ impl<'a> IntervalLogHistogram<'a> { } /// Duration of the interval in seconds. - pub fn duration(&self) -> f64 { - self.duration + pub fn duration(&self) -> time::Duration { + let secs = self.duration as u64; + // can't overflow because this can be at most 1 billion which is approx 2^30 + let nsecs = (self.duration.fract() * 1_000_000_000_f64) as u32; + time::Duration::new(secs, nsecs) } /// Max value in the encoded histogram @@ -525,6 +530,10 @@ impl<'a> Iterator for IntervalLogIterator<'a> { } } +fn duration_as_fp_seconds(d: time::Duration) -> f64 { + d.as_secs() as f64 + d.subsec_nanos() as f64 / 1_000_000_000_f64 +} + named!(start_time<&[u8], LogEntry>, do_parse!( tag!("#[StartTime: ") >> diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index fead108..4abb744 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -1,3 +1,5 @@ +use std::time; + use super::super::super::*; use super::super::*; use super::*; @@ -45,7 +47,13 @@ fn write_interval_histo_no_tag() { let mut log_writer = header_writer.into_log_writer(); log_writer - .write_histogram(&h, 1.2345678, 5.67, None, 10.0) + .write_histogram( + &h, + 1.2345678, + time::Duration::new(5, 670_000_000), + None, + 10.0, + ) .unwrap(); } @@ -67,7 +75,13 @@ fn write_interval_histo_with_tag() { let mut log_writer = header_writer.into_log_writer(); log_writer - .write_histogram(&h, 1.234, 5.678, Tag::new("t"), 1.0) + .write_histogram( + &h, + 1.234, + time::Duration::new(5, 678_000_000), + Tag::new("t"), + 1.0, + ) .unwrap(); } diff --git a/src/serialization/rand_varint.rs b/src/serialization/rand_varint.rs index 9e751c9..74b5dbb 100644 --- a/src/serialization/rand_varint.rs +++ b/src/serialization/rand_varint.rs @@ -1,3 +1,6 @@ +//! This is used in tests (both unit tests and integration tests) to provide useful distributions +//! of random numbers. 
+ extern crate rand; use self::rand::Rng; diff --git a/tests/interval_log.rs b/tests/interval_log.rs index 44df4e3..64077cb 100644 --- a/tests/interval_log.rs +++ b/tests/interval_log.rs @@ -12,7 +12,7 @@ mod tests { IntervalLogHistogram, IntervalLogIterator, LogEntry, LogIteratorError, Tag}; - use std::{io, str}; + use std::{io, str, time}; use std::io::{BufRead, Read}; use std::fs::File; use std::path::Path; @@ -97,7 +97,13 @@ mod tests { intervals .iter() .filter(|ilh| ilh.tag().is_none()) - .map(|ilh| { (ilh.start_timestamp(), ilh.duration(), ilh.max()) }) + .map(|ilh| { + ( + ilh.start_timestamp(), + round(duration_as_fp_seconds(ilh.duration())), + ilh.max(), + ) + }) .collect::>() ); @@ -106,7 +112,13 @@ mod tests { intervals .iter() .filter(|ilh| !ilh.tag().is_none()) - .map(|ilh| { (ilh.start_timestamp(), ilh.duration(), ilh.max()) }) + .map(|ilh| { + ( + ilh.start_timestamp(), + round(duration_as_fp_seconds(ilh.duration())), + ilh.max(), + ) + }) .collect::>() ); @@ -241,7 +253,13 @@ mod tests { .map(|s| Tag::new(s.as_str()).unwrap()); writer - .write_histogram(&h, i as f64, (i as f64) + 10000.0, tag, max_scaling_factor) + .write_histogram( + &h, + i as f64, + time::Duration::new(10_000 + i as u64, 0), + tag, + max_scaling_factor, + ) .unwrap(); writer.write_comment(&format!("line {}", i)).unwrap(); @@ -250,10 +268,11 @@ mod tests { } } + println!("{}", ::std::str::from_utf8(&log_buf).unwrap()); + let parsed = IntervalLogIterator::new(&log_buf) - .map(|r| r.unwrap()) .filter_map(|e| match e { - LogEntry::Interval(ilh) => Some(ilh), + Ok(LogEntry::Interval(ilh)) => Some(ilh), _ => None, }) .collect::>(); @@ -272,7 +291,10 @@ mod tests { assert_eq!(original_hist, &decoded_hist); assert_eq!(index as f64, ilh.start_timestamp()); - assert_eq!((index as f64) + 10000.0, ilh.duration()); + assert_eq!( + time::Duration::new(10_000 + index as u64, 0), + ilh.duration() + ); assert_eq!( round(original_hist.max() as f64 / max_scaling_factor), ilh.max() @@ -287,6 +309,10 @@ mod tests { format!("{:.3}", f).parse::().unwrap() } + fn duration_as_fp_seconds(d: time::Duration) -> f64 { + d.as_secs() as f64 + d.subsec_nanos() as f64 / 1_000_000_000_f64 + } + fn load_iterator_from_file<'a>(path: &Path) -> IntervalLogBufHolder { let mut buf = Vec::new(); let _ = File::open(path).unwrap().read_to_end(&mut buf).unwrap(); From 15c14aa827fad25384fcfde2f43b4c2ab873cd24 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Thu, 2 Nov 2017 21:11:54 -0500 Subject: [PATCH 12/18] IntervalLogIterator stops once it encounters a parse error --- src/serialization/interval_log/mod.rs | 10 +++++++++- tests/interval_log.rs | 17 ++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 46b04c2..f70ce8d 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -94,7 +94,7 @@ //! ``` //! use hdrsample::serialization::interval_log; //! -//! let mut log = "\ +//! let log = "\ //! #I'm a comment\n\ //! Tag=a,0.123,1.007,2.769,base64EncodedHisto\n\ //! 
1.456,1.007,2.769,base64EncodedHisto\n\ @@ -486,6 +486,7 @@ pub enum LogIteratorError { pub struct IntervalLogIterator<'a> { orig_len: usize, input: &'a [u8], + ended: bool, } impl<'a> IntervalLogIterator<'a> { @@ -494,6 +495,7 @@ impl<'a> IntervalLogIterator<'a> { IntervalLogIterator { orig_len: input.len(), input, + ended: false, } } } @@ -503,7 +505,12 @@ impl<'a> Iterator for IntervalLogIterator<'a> { fn next(&mut self) -> Option { loop { + if self.ended { + return None; + } + if self.input.is_empty() { + self.ended = true; return None; } @@ -521,6 +528,7 @@ impl<'a> Iterator for IntervalLogIterator<'a> { continue; } _ => { + self.ended = true; return Some(Err(LogIteratorError::ParseError { offset: self.orig_len - self.input.len(), })); diff --git a/tests/interval_log.rs b/tests/interval_log.rs index 64077cb..3551104 100644 --- a/tests/interval_log.rs +++ b/tests/interval_log.rs @@ -30,7 +30,6 @@ mod tests { assert_eq!(1, start_count); } - #[test] fn parse_sample_tagged_interval_log_interval_count() { let data = load_iterator_from_file(Path::new("tests/data/tagged-Log.logV2.hlog")); @@ -268,8 +267,6 @@ mod tests { } } - println!("{}", ::std::str::from_utf8(&log_buf).unwrap()); - let parsed = IntervalLogIterator::new(&log_buf) .filter_map(|e| match e { Ok(LogEntry::Interval(ilh)) => Some(ilh), @@ -304,6 +301,20 @@ mod tests { } } + #[test] + fn parse_interval_log_syntax_error_then_returns_none() { + let log = "#Foo\nBar\n" + .as_bytes(); + + let mut iter = IntervalLogIterator::new(&log); + + assert_eq!( + Some(Err(LogIteratorError::ParseError { offset: 5 })), + iter.next() + ); + assert_eq!(None, iter.next()); + } + /// Round to 3 digits the way floats are in the log fn round(f: f64) -> f64 { format!("{:.3}", f).parse::().unwrap() From a5997fe1e2a3de35a8d26811c5435b1f33c4774f Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 3 Nov 2017 10:05:50 -0500 Subject: [PATCH 13/18] Use builder style for creating an IntervalLogWriter --- benches/interval_log.rs | 18 ++- src/serialization/interval_log/mod.rs | 165 ++++++++++++++++-------- src/serialization/interval_log/tests.rs | 88 ++++++++----- tests/interval_log.rs | 42 +++--- 4 files changed, 205 insertions(+), 108 deletions(-) diff --git a/benches/interval_log.rs b/benches/interval_log.rs index 77b7ca7..18a61a6 100644 --- a/benches/interval_log.rs +++ b/benches/interval_log.rs @@ -4,6 +4,8 @@ extern crate hdrsample; extern crate rand; extern crate test; +use std::time; + use hdrsample::*; use hdrsample::serialization; use hdrsample::serialization::interval_log; @@ -35,11 +37,13 @@ fn write_interval_log_1k_hist_10k_value(b: &mut Bencher) { b.iter(|| { log.clear(); - let mut writer = - interval_log::IntervalLogHeaderWriter::new(&mut log, &mut serializer).into_log_writer(); + let mut writer = interval_log::IntervalLogWriterBuilder::new() + .build_with(&mut log, &mut serializer) + .unwrap(); + let dur = time::Duration::new(5, 678_000_000); for h in histograms.iter() { - writer.write_histogram(h, 1.234, 5.678, None, 1.0).unwrap(); + writer.write_histogram(h, 1.234, dur, None).unwrap(); } }) } @@ -62,11 +66,13 @@ fn parse_interval_log_1k_hist_10k_value(b: &mut Bencher) { { let mut serializer = serialization::V2Serializer::new(); - let mut writer = - interval_log::IntervalLogHeaderWriter::new(&mut log, &mut serializer).into_log_writer(); + let mut writer = interval_log::IntervalLogWriterBuilder::new() + .build_with(&mut log, &mut serializer) + .unwrap(); + let dur = time::Duration::new(5, 678_000_000); for h in histograms.iter() { - 
writer.write_histogram(h, 1.234, 5.678, None, 1.0).unwrap(); + writer.write_histogram(h, 1.234, dur, None).unwrap(); } } diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index f70ce8d..c4bef6a 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -136,12 +136,11 @@ //! //! // limit scope of mutable borrow of `buf` //! { -//! let mut header_writer = interval_log::IntervalLogHeaderWriter::new( -//! &mut buf, &mut serializer); -//! header_writer.write_comment("Comments are great").unwrap(); -//! header_writer.write_start_time(123.456789).unwrap(); -//! -//! let mut log_writer = header_writer.into_log_writer(); +//! let mut log_writer = interval_log::IntervalLogWriterBuilder::new() +//! .add_comment("Comments are great") +//! .with_start_time(123.456789) +//! .build_with(&mut buf, &mut serializer) +//! .unwrap(); //! //! log_writer.write_comment( //! "You can have comments anywhere in the log").unwrap(); @@ -151,8 +150,8 @@ //! &h, //! 1.234, //! time::Duration::new(12, 345_678_901), -//! interval_log::Tag::new("im-a-tag"), -//! 1.0) +//! interval_log::Tag::new("im-a-tag") +//! ) //! .unwrap(); //! } //! @@ -170,69 +169,138 @@ use nom::{double, line_ending, not_line_ending, IResult}; use super::super::{Counter, Histogram}; use super::Serializer; -/// Write headers for an interval log. +/// Prepare an `IntervalLogWriter`. /// /// This type only allows writing comments and headers. Once you're done writing those things, use /// `into_log_writer()` to convert this into an `IntervalLogWriter`. -pub struct IntervalLogHeaderWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { - internal_writer: InternalLogWriter<'a, 'b, W, S>, +pub struct IntervalLogWriterBuilder { + comments: Vec, + start_time: Option, + base_time: Option, + max_value_divisor: f64, } -impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogHeaderWriter<'a, 'b, W, S> { +impl IntervalLogWriterBuilder { /// Create a new log writer that writes to `writer` and serializes histograms with `serializer`. - pub fn new(writer: &'a mut W, serializer: &'b mut S) -> IntervalLogHeaderWriter<'a, 'b, W, S> { - IntervalLogHeaderWriter { - internal_writer: InternalLogWriter { - writer, - serializer, - text_buf: String::new(), - serialize_buf: Vec::new(), - }, + pub fn new() -> IntervalLogWriterBuilder { + IntervalLogWriterBuilder { + comments: Vec::new(), + start_time: None, + base_time: None, + max_value_divisor: 1.0, } } - /// Add a comment line. + /// Add a comment line to be written when the writer is built. /// /// If you do silly things like write a comment with a newline character, you'll end up with /// an un-parseable log file. Don't do that. - pub fn write_comment(&mut self, s: &str) -> io::Result<()> { - self.internal_writer.write_comment(s) + pub fn add_comment(&mut self, s: &str) -> &mut Self { + self.comments.push(s.to_owned()); + self } - /// Write a StartTime log line. See the module-level documentation for more info. + /// Set a StartTime. See the module-level documentation for more info. /// - /// This should only be called once to avoid creating a confusing interval log. - pub fn write_start_time(&mut self, seconds_since_epoch: f64) -> io::Result<()> { - self.internal_writer.write_fmt(format_args!( - "#[StartTime: {:.3} (seconds since epoch)]\n", - seconds_since_epoch - )) + /// This can be called multiple times, but only the value for the most recent invocation will + /// be written. 
+ pub fn with_start_time(&mut self, seconds_since_epoch: f64) -> &mut Self { + self.start_time = Some(seconds_since_epoch); + self } - /// Write a BaseTime log line. See the module-level documentation for more info. + /// Set a BaseTime. See the module-level documentation for more info. /// - /// This should only be called once to avoid creating a confusing interval log. - pub fn write_base_time(&mut self, seconds_since_epoch: f64) -> io::Result<()> { - self.internal_writer.write_fmt(format_args!( - "#[BaseTime: {:.3} (seconds since epoch)]\n", - seconds_since_epoch - )) + /// This can be called multiple times, but only the value for the most recent invocation will + /// be written. + pub fn with_base_time(&mut self, seconds_since_epoch: f64) -> &mut Self { + self.base_time = Some(seconds_since_epoch); + self } - /// Once you're finished with headers, convert this into a log writer so you can write interval - /// histograms. - pub fn into_log_writer(self) -> IntervalLogWriter<'a, 'b, W, S> { - IntervalLogWriter { - internal_writer: self.internal_writer, + /// Set a max value divisor. + /// + /// This is used to scale down the max value part of an interval log to something that may be + /// more human readable. The max value in the log is only for human consumption, so you might + /// prefer to divide by 109 to turn nanoseconds into fractional seconds, for + /// instance. + /// + /// If this is not set, 1.0 will be used. + /// + /// This can be called multiple times, but only the value for the most recent invocation will + /// be written. + pub fn with_max_value_divisor(&mut self, max_value_divisor: f64) -> &mut Self { + self.max_value_divisor = max_value_divisor; + self + } + + /// Build a LogWriter and apply any configured headers. + pub fn build_with<'a, 'b, W: 'a + io::Write, S: 'b + Serializer>( + &self, + writer: &'a mut W, + serializer: &'b mut S, + ) -> Result, io::Error> { + let mut internal_writer = InternalLogWriter { + writer, + serializer, + text_buf: String::new(), + serialize_buf: Vec::new(), + max_value_divisor: self.max_value_divisor, + }; + + for c in self.comments.iter() { + internal_writer.write_comment(&c)?; + } + + if let Some(st) = self.start_time { + internal_writer.write_fmt(format_args!( + "#[StartTime: {:.3} (seconds since epoch)]\n", + st + ))?; + } + + if let Some(bt) = self.base_time { + internal_writer.write_fmt(format_args!( + "#[BaseTime: {:.3} (seconds since epoch)]\n", + bt + ))?; } + + // The Java impl doesn't write a comment for this but it's confusing to silently modify the + // max value without leaving a trace + if self.max_value_divisor != 1.0_f64 { + internal_writer.write_fmt(format_args!( + "#[MaxValueDivisor: {:.3}]\n", + self.max_value_divisor + ))?; + } + + Ok(IntervalLogWriter { internal_writer }) } } /// Writes interval histograms in an interval log. /// -/// This isn't created directly; start with an `IntervalLogHeaderWriter`. Once you've written the +/// This isn't created directly; start with an `IntervalLogWriterBuilder`. Once you've written the /// headers and ended up with an `IntervalLogWriter`, typical usage would be to write a histogram /// at regular intervals (e.g. once a second). 
+/// +/// ``` +/// use hdrsample::serialization; +/// use hdrsample::serialization::interval_log; +/// +/// let mut buf = Vec::new(); +/// let mut serializer = serialization::V2Serializer::new(); +/// +/// // create a writer via a builder +/// let mut writer = interval_log::IntervalLogWriterBuilder::new() +/// .build_with(&mut buf, &mut serializer) +/// .unwrap(); +/// +/// writer.write_comment("Comment 2").unwrap(); +/// +/// // .. write some intervals +/// ``` pub struct IntervalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { internal_writer: InternalLogWriter<'a, 'b, W, S>, } @@ -253,20 +321,15 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, /// `duration` is the duration of the interval in seconds. /// /// `tag` is an optional tag for this histogram. - /// - /// `max_value_divisor` is used to scale down the max value to something that may be more human - /// readable. The max value in the log is only for human consumption, so you might prefer to - /// divide by 109 to turn nanoseconds into fractional seconds, for instance. pub fn write_histogram( &mut self, h: &Histogram, start_timestamp: f64, duration: time::Duration, tag: Option, - max_value_divisor: f64, ) -> Result<(), IntervalLogWriterError> { self.internal_writer - .write_histogram(h, start_timestamp, duration, tag, max_value_divisor) + .write_histogram(h, start_timestamp, duration, tag) } } @@ -291,6 +354,7 @@ struct InternalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { serializer: &'b mut S, text_buf: String, serialize_buf: Vec, + max_value_divisor: f64, } impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, S> { @@ -308,7 +372,6 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, start_timestamp: f64, duration: time::Duration, tag: Option, - max_value_divisor: f64, ) -> Result<(), IntervalLogWriterError> { self.serialize_buf.clear(); self.text_buf.clear(); @@ -323,7 +386,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, self.text_buf, start_timestamp, duration_as_fp_seconds(duration), - h.max() as f64 / max_value_divisor // because the Java impl does it this way + h.max() as f64 / self.max_value_divisor // because the Java impl does it this way )?; self.text_buf.clear(); diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index 4abb744..10f1ae3 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -9,11 +9,10 @@ fn write_header_comment() { let mut buf = Vec::new(); let mut serializer = V2Serializer::new(); - { - let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); - - header_writer.write_comment("foo").unwrap(); - } + let _ = IntervalLogWriterBuilder::new() + .add_comment("foo") + .build_with(&mut buf, &mut serializer) + .unwrap(); assert_eq!(&b"#foo\n"[..], &buf[..]); } @@ -24,14 +23,40 @@ fn write_header_then_interval_comment() { let mut serializer = V2Serializer::new(); { - let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); - header_writer.write_comment("foo").unwrap(); - let mut log_writer = header_writer.into_log_writer(); + let mut log_writer = IntervalLogWriterBuilder::new() + .add_comment("foo") + .add_comment("bar") + .build_with(&mut buf, &mut serializer) + .unwrap(); + log_writer.write_comment("baz").unwrap(); + } + + assert_eq!("#foo\n#bar\n#baz\n", str::from_utf8(&buf[..]).unwrap()); +} - 
log_writer.write_comment("bar").unwrap(); +#[test] +fn write_headers_multiple_times_only_last_is_used() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + { + let _ = IntervalLogWriterBuilder::new() + .with_start_time(10.0) + .with_base_time(20.0) + .with_start_time(100.0) + .with_base_time(200.0) + .with_max_value_divisor(1_000.0) + .with_max_value_divisor(1_000_000.0) + .build_with(&mut buf, &mut serializer) + .unwrap(); } - assert_eq!("#foo\n#bar\n", str::from_utf8(&buf[..]).unwrap()); + let expected = "\ + #[StartTime: 100.000 (seconds since epoch)]\n\ + #[BaseTime: 200.000 (seconds since epoch)]\n\ + #[MaxValueDivisor: 1000000.000]\n"; + + assert_eq!(expected, str::from_utf8(&buf[..]).unwrap()); } #[test] @@ -43,24 +68,21 @@ fn write_interval_histo_no_tag() { h.record(1000).unwrap(); { - let header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); - let mut log_writer = header_writer.into_log_writer(); + let mut log_writer = IntervalLogWriterBuilder::new() + .with_max_value_divisor(10.0) + .build_with(&mut buf, &mut serializer) + .unwrap(); log_writer - .write_histogram( - &h, - 1.2345678, - time::Duration::new(5, 670_000_000), - None, - 10.0, - ) + .write_histogram(&h, 1.2345678, time::Duration::new(5, 670_000_000), None) .unwrap(); } - assert_eq!( - "1.235,5.670,100.000,HISTEwAAAAMAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAM8PAg==\n", - str::from_utf8(&buf[..]).unwrap() - ); + let expected = "\ + #[MaxValueDivisor: 10.000]\n\ + 1.235,5.670,100.000,HISTEwAAAAMAAAAAAAAAAwAAAAAAAAAB//////////8/8AAAAAAAAM8PAg==\n"; + + assert_eq!(expected, str::from_utf8(&buf[..]).unwrap()); } #[test] @@ -71,8 +93,9 @@ fn write_interval_histo_with_tag() { let h = Histogram::::new_with_bounds(1, u64::max_value(), 3).unwrap(); { - let header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); - let mut log_writer = header_writer.into_log_writer(); + let mut log_writer = IntervalLogWriterBuilder::new() + .build_with(&mut buf, &mut serializer) + .unwrap(); log_writer .write_histogram( @@ -80,7 +103,6 @@ fn write_interval_histo_with_tag() { 1.234, time::Duration::new(5, 678_000_000), Tag::new("t"), - 1.0, ) .unwrap(); } @@ -96,10 +118,10 @@ fn write_start_time() { let mut buf = Vec::new(); let mut serializer = V2Serializer::new(); - { - let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); - header_writer.write_start_time(123.456789).unwrap(); - } + let _ = IntervalLogWriterBuilder::new() + .with_start_time(123.456789) + .build_with(&mut buf, &mut serializer) + .unwrap(); assert_eq!( "#[StartTime: 123.457 (seconds since epoch)]\n", @@ -113,8 +135,10 @@ fn write_base_time() { let mut serializer = V2Serializer::new(); { - let mut header_writer = IntervalLogHeaderWriter::new(&mut buf, &mut serializer); - header_writer.write_base_time(123.456789).unwrap(); + let _ = IntervalLogWriterBuilder::new() + .with_base_time(123.456789) + .build_with(&mut buf, &mut serializer) + .unwrap(); } assert_eq!( diff --git a/tests/interval_log.rs b/tests/interval_log.rs index 3551104..34e289c 100644 --- a/tests/interval_log.rs +++ b/tests/interval_log.rs @@ -8,11 +8,11 @@ mod tests { use self::hdrsample::Histogram; use self::hdrsample::serialization::{Deserializer, Serializer, V2Serializer}; - use self::hdrsample::serialization::interval_log::{IntervalLogHeaderWriter, - IntervalLogHistogram, IntervalLogIterator, - LogEntry, LogIteratorError, Tag}; + use self::hdrsample::serialization::interval_log::{IntervalLogHistogram, IntervalLogIterator, + 
IntervalLogWriterBuilder, LogEntry, + LogIteratorError, Tag}; - use std::{io, str, time}; + use std::{io, iter, str, time}; use std::io::{BufRead, Read}; use std::fs::File; use std::path::Path; @@ -178,8 +178,10 @@ mod tests { let mut duplicate_log = Vec::new(); { - let mut writer = - IntervalLogHeaderWriter::new(&mut duplicate_log, &mut serializer).into_log_writer(); + let mut writer = IntervalLogWriterBuilder::new() + .with_max_value_divisor(1_000_000.0) + .build_with(&mut duplicate_log, &mut serializer) + .unwrap(); IntervalLogIterator::new(&log_without_headers) .map(|r| r.unwrap()) @@ -200,7 +202,6 @@ mod tests { ilh.start_timestamp(), ilh.duration(), ilh.tag(), - 1_000_000.0, ) .unwrap(); }); @@ -208,7 +209,15 @@ mod tests { let orig_str = str::from_utf8(&log_without_headers).unwrap(); - let rewritten_str = str::from_utf8(&duplicate_log).unwrap(); + let rewritten_str = str::from_utf8(&duplicate_log) + .unwrap() + .lines() + // remove our #[MaxValueDivisor] comment + .filter(|l| !l.starts_with("#[MaxValueDivisor: ")) + // put newlines back in + .flat_map(|l| iter::once(l).chain(iter::once("\n"))) + .collect::(); + assert_eq!(orig_str, rewritten_str); } @@ -226,8 +235,10 @@ mod tests { let max_scaling_factor = 1_000_000.0; { - let mut writer = - IntervalLogHeaderWriter::new(&mut log_buf, &mut serializer).into_log_writer(); + let mut writer = IntervalLogWriterBuilder::new() + .with_max_value_divisor(max_scaling_factor) + .build_with(&mut log_buf, &mut serializer) + .unwrap(); writer.write_comment("start").unwrap(); @@ -252,13 +263,7 @@ mod tests { .map(|s| Tag::new(s.as_str()).unwrap()); writer - .write_histogram( - &h, - i as f64, - time::Duration::new(10_000 + i as u64, 0), - tag, - max_scaling_factor, - ) + .write_histogram(&h, i as f64, time::Duration::new(10_000 + i as u64, 0), tag) .unwrap(); writer.write_comment(&format!("line {}", i)).unwrap(); @@ -303,8 +308,7 @@ mod tests { #[test] fn parse_interval_log_syntax_error_then_returns_none() { - let log = "#Foo\nBar\n" - .as_bytes(); + let log = "#Foo\nBar\n".as_bytes(); let mut iter = IntervalLogIterator::new(&log); From 046b595aa64c486af659c53de4cba60f6a00165b Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 3 Nov 2017 15:13:24 -0500 Subject: [PATCH 14/18] Use `SystemTime` for start and base time --- benches/interval_log.rs | 4 ++-- src/serialization/interval_log/mod.rs | 25 +++++++++++++------- src/serialization/interval_log/tests.rs | 31 ++++++++++++++----------- tests/interval_log.rs | 4 ++-- 4 files changed, 39 insertions(+), 25 deletions(-) diff --git a/benches/interval_log.rs b/benches/interval_log.rs index 18a61a6..2c6dece 100644 --- a/benches/interval_log.rs +++ b/benches/interval_log.rs @@ -38,7 +38,7 @@ fn write_interval_log_1k_hist_10k_value(b: &mut Bencher) { log.clear(); let mut writer = interval_log::IntervalLogWriterBuilder::new() - .build_with(&mut log, &mut serializer) + .begin_log_with(&mut log, &mut serializer) .unwrap(); let dur = time::Duration::new(5, 678_000_000); @@ -67,7 +67,7 @@ fn parse_interval_log_1k_hist_10k_value(b: &mut Bencher) { { let mut serializer = serialization::V2Serializer::new(); let mut writer = interval_log::IntervalLogWriterBuilder::new() - .build_with(&mut log, &mut serializer) + .begin_log_with(&mut log, &mut serializer) .unwrap(); let dur = time::Duration::new(5, 678_000_000); diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index c4bef6a..29b3d68 100644 --- a/src/serialization/interval_log/mod.rs +++ 
b/src/serialization/interval_log/mod.rs @@ -138,8 +138,8 @@ //! { //! let mut log_writer = interval_log::IntervalLogWriterBuilder::new() //! .add_comment("Comments are great") -//! .with_start_time(123.456789) -//! .build_with(&mut buf, &mut serializer) +//! .with_start_time(time::SystemTime::now()) +//! .begin_log_with(&mut buf, &mut serializer) //! .unwrap(); //! //! log_writer.write_comment( @@ -204,8 +204,8 @@ impl IntervalLogWriterBuilder { /// /// This can be called multiple times, but only the value for the most recent invocation will /// be written. - pub fn with_start_time(&mut self, seconds_since_epoch: f64) -> &mut Self { - self.start_time = Some(seconds_since_epoch); + pub fn with_start_time(&mut self, time: time::SystemTime) -> &mut Self { + self.start_time = Some(system_time_as_fp_seconds(time)); self } @@ -213,8 +213,8 @@ impl IntervalLogWriterBuilder { /// /// This can be called multiple times, but only the value for the most recent invocation will /// be written. - pub fn with_base_time(&mut self, seconds_since_epoch: f64) -> &mut Self { - self.base_time = Some(seconds_since_epoch); + pub fn with_base_time(&mut self, time: time::SystemTime) -> &mut Self { + self.base_time = Some(system_time_as_fp_seconds(time)); self } @@ -235,7 +235,7 @@ impl IntervalLogWriterBuilder { } /// Build a LogWriter and apply any configured headers. - pub fn build_with<'a, 'b, W: 'a + io::Write, S: 'b + Serializer>( + pub fn begin_log_with<'a, 'b, W: 'a + io::Write, S: 'b + Serializer>( &self, writer: &'a mut W, serializer: &'b mut S, @@ -294,7 +294,7 @@ impl IntervalLogWriterBuilder { /// /// // create a writer via a builder /// let mut writer = interval_log::IntervalLogWriterBuilder::new() -/// .build_with(&mut buf, &mut serializer) +/// .begin_log_with(&mut buf, &mut serializer) /// .unwrap(); /// /// writer.write_comment("Comment 2").unwrap(); @@ -605,6 +605,15 @@ fn duration_as_fp_seconds(d: time::Duration) -> f64 { d.as_secs() as f64 + d.subsec_nanos() as f64 / 1_000_000_000_f64 } +fn system_time_as_fp_seconds(time: time::SystemTime) -> f64 { + match time.duration_since(time::UNIX_EPOCH) { + Ok(dur_after_epoch) => duration_as_fp_seconds(dur_after_epoch), + // Doesn't seem possible to be before the epoch, but using a negative number seems like + // a reasonable representation if it does occur + Err(t) => duration_as_fp_seconds(t.duration()) * -1_f64, + } +} + named!(start_time<&[u8], LogEntry>, do_parse!( tag!("#[StartTime: ") >> diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index 10f1ae3..436cd80 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -1,4 +1,5 @@ use std::time; +use std::ops::Add; use super::super::super::*; use super::super::*; @@ -11,7 +12,7 @@ fn write_header_comment() { let _ = IntervalLogWriterBuilder::new() .add_comment("foo") - .build_with(&mut buf, &mut serializer) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); assert_eq!(&b"#foo\n"[..], &buf[..]); @@ -26,7 +27,7 @@ fn write_header_then_interval_comment() { let mut log_writer = IntervalLogWriterBuilder::new() .add_comment("foo") .add_comment("bar") - .build_with(&mut buf, &mut serializer) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); log_writer.write_comment("baz").unwrap(); } @@ -41,13 +42,13 @@ fn write_headers_multiple_times_only_last_is_used() { { let _ = IntervalLogWriterBuilder::new() - .with_start_time(10.0) - .with_base_time(20.0) - .with_start_time(100.0) - .with_base_time(200.0) + 
.with_start_time(system_time_after_epoch(10, 0)) + .with_base_time(system_time_after_epoch(20, 0)) + .with_start_time(system_time_after_epoch(100, 0)) + .with_base_time(system_time_after_epoch(200, 0)) .with_max_value_divisor(1_000.0) .with_max_value_divisor(1_000_000.0) - .build_with(&mut buf, &mut serializer) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); } @@ -70,7 +71,7 @@ fn write_interval_histo_no_tag() { { let mut log_writer = IntervalLogWriterBuilder::new() .with_max_value_divisor(10.0) - .build_with(&mut buf, &mut serializer) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); log_writer @@ -94,7 +95,7 @@ fn write_interval_histo_with_tag() { { let mut log_writer = IntervalLogWriterBuilder::new() - .build_with(&mut buf, &mut serializer) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); log_writer @@ -119,8 +120,8 @@ fn write_start_time() { let mut serializer = V2Serializer::new(); let _ = IntervalLogWriterBuilder::new() - .with_start_time(123.456789) - .build_with(&mut buf, &mut serializer) + .with_start_time(system_time_after_epoch(123, 456_789_012)) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); assert_eq!( @@ -136,8 +137,8 @@ fn write_base_time() { { let _ = IntervalLogWriterBuilder::new() - .with_base_time(123.456789) - .build_with(&mut buf, &mut serializer) + .with_base_time(system_time_after_epoch(123, 456_789_012)) + .begin_log_with(&mut buf, &mut serializer) .unwrap(); } @@ -317,3 +318,7 @@ fn iter_all_ignored_empty_iter() { assert_eq!(0, IntervalLogIterator::new(&data).count()); } + +fn system_time_after_epoch(secs: u64, nanos: u32) -> time::SystemTime { + time::UNIX_EPOCH.add(time::Duration::new(secs, nanos)) +} diff --git a/tests/interval_log.rs b/tests/interval_log.rs index 34e289c..39b93b7 100644 --- a/tests/interval_log.rs +++ b/tests/interval_log.rs @@ -180,7 +180,7 @@ mod tests { { let mut writer = IntervalLogWriterBuilder::new() .with_max_value_divisor(1_000_000.0) - .build_with(&mut duplicate_log, &mut serializer) + .begin_log_with(&mut duplicate_log, &mut serializer) .unwrap(); IntervalLogIterator::new(&log_without_headers) @@ -237,7 +237,7 @@ mod tests { { let mut writer = IntervalLogWriterBuilder::new() .with_max_value_divisor(max_scaling_factor) - .build_with(&mut log_buf, &mut serializer) + .begin_log_with(&mut log_buf, &mut serializer) .unwrap(); writer.write_comment("start").unwrap(); From 4d35a9e20903d3d21cd95aec05024b7d1c62e6be Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 3 Nov 2017 15:38:50 -0500 Subject: [PATCH 15/18] Use Duration for per-interval start timestamp --- benches/interval_log.rs | 4 +-- src/serialization/interval_log/mod.rs | 39 +++++++++++++++---------- src/serialization/interval_log/tests.rs | 9 ++++-- tests/interval_log.rs | 13 ++++++--- 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/benches/interval_log.rs b/benches/interval_log.rs index 2c6dece..9b2b29a 100644 --- a/benches/interval_log.rs +++ b/benches/interval_log.rs @@ -43,7 +43,7 @@ fn write_interval_log_1k_hist_10k_value(b: &mut Bencher) { let dur = time::Duration::new(5, 678_000_000); for h in histograms.iter() { - writer.write_histogram(h, 1.234, dur, None).unwrap(); + writer.write_histogram(h, time::Duration::new(1, 234_000_000), dur, None).unwrap(); } }) } @@ -72,7 +72,7 @@ fn parse_interval_log_1k_hist_10k_value(b: &mut Bencher) { let dur = time::Duration::new(5, 678_000_000); for h in histograms.iter() { - writer.write_histogram(h, 1.234, dur, None).unwrap(); + writer.write_histogram(h, time::Duration::new(1, 
234_000_000), dur, None).unwrap(); } } diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 29b3d68..6ebc4b0 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -70,6 +70,7 @@ //! Parse a single interval from a log. //! //! ``` +//! use std::time; //! use hdrsample::serialization::interval_log; //! //! // two newline-separated log lines: a comment, then an interval @@ -78,9 +79,9 @@ //! let mut iter = interval_log::IntervalLogIterator::new(&log[..]); //! //! // the comment is consumed and ignored by the parser, so the first event is an Interval -//! match iter.next().unwrap().unwrap() { -//! interval_log::LogEntry::Interval(h) => { -//! assert_eq!(0.127, h.start_timestamp()); +//! match iter.next().unwrap() { +//! Ok(interval_log::LogEntry::Interval(h)) => { +//! assert_eq!(time::Duration::new(0, 127_000_000), h.start_timestamp()); //! } //! _ => panic!() //! } @@ -113,7 +114,7 @@ //! _ => None //! }) //! // do any filtering you want -//! .filter(|ilh| ilh.start_timestamp() >= 3.0) +//! .filter(|ilh| ilh.start_timestamp().as_secs() >= 3) //! .count(); //! //! assert_eq!(3, count); @@ -136,9 +137,10 @@ //! //! // limit scope of mutable borrow of `buf` //! { +//! let now = time::SystemTime::now(); //! let mut log_writer = interval_log::IntervalLogWriterBuilder::new() //! .add_comment("Comments are great") -//! .with_start_time(time::SystemTime::now()) +//! .with_start_time(now) //! .begin_log_with(&mut buf, &mut serializer) //! .unwrap(); //! @@ -148,7 +150,7 @@ //! log_writer //! .write_histogram( //! &h, -//! 1.234, +//! now.elapsed().unwrap(), //! time::Duration::new(12, 345_678_901), //! interval_log::Tag::new("im-a-tag") //! ) @@ -324,7 +326,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, pub fn write_histogram( &mut self, h: &Histogram, - start_timestamp: f64, + start_timestamp: time::Duration, duration: time::Duration, tag: Option, ) -> Result<(), IntervalLogWriterError> { @@ -369,7 +371,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, fn write_histogram( &mut self, h: &Histogram, - start_timestamp: f64, + start_timestamp: time::Duration, duration: time::Duration, tag: Option, ) -> Result<(), IntervalLogWriterError> { @@ -384,7 +386,7 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, self.writer, "{}{:.3},{:.3},{:.3},", self.text_buf, - start_timestamp, + duration_as_fp_seconds(start_timestamp), duration_as_fp_seconds(duration), h.max() as f64 / self.max_value_divisor // because the Java impl does it this way )?; @@ -456,21 +458,19 @@ impl<'a> IntervalLogHistogram<'a> { self.tag } - /// Timestamp of the start of the interval in seconds. + /// Timestamp of the start of the interval in seconds, expressed as a `Duration` relative to + /// some start point. /// /// The timestamp may be absolute vs the epoch, or there may be a `StartTime` or `BaseTime` for /// the log, in which case you may wish to consider this number as a delta vs those timestamps. /// See the module-level documentation about timestamps. - pub fn start_timestamp(&self) -> f64 { - self.start_timestamp + pub fn start_timestamp(&self) -> time::Duration { + fp_seconds_as_duration(self.start_timestamp) } /// Duration of the interval in seconds. 
pub fn duration(&self) -> time::Duration { - let secs = self.duration as u64; - // can't overflow because this can be at most 1 billion which is approx 2^30 - let nsecs = (self.duration.fract() * 1_000_000_000_f64) as u32; - time::Duration::new(secs, nsecs) + fp_seconds_as_duration(self.duration) } /// Max value in the encoded histogram @@ -601,6 +601,13 @@ impl<'a> Iterator for IntervalLogIterator<'a> { } } +fn fp_seconds_as_duration(fp_secs: f64) -> time::Duration { + let secs = fp_secs as u64; + // can't overflow because this can be at most 1 billion which is approx 2^30 + let nsecs = (fp_secs.fract() * 1_000_000_000_f64) as u32; + time::Duration::new(secs, nsecs) +} + fn duration_as_fp_seconds(d: time::Duration) -> f64 { d.as_secs() as f64 + d.subsec_nanos() as f64 / 1_000_000_000_f64 } diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index 436cd80..a591612 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -75,7 +75,12 @@ fn write_interval_histo_no_tag() { .unwrap(); log_writer - .write_histogram(&h, 1.2345678, time::Duration::new(5, 670_000_000), None) + .write_histogram( + &h, + time::Duration::new(1, 234_567_890), + time::Duration::new(5, 670_000_000), + None, + ) .unwrap(); } @@ -101,7 +106,7 @@ fn write_interval_histo_with_tag() { log_writer .write_histogram( &h, - 1.234, + time::Duration::new(1, 234_000_000), time::Duration::new(5, 678_000_000), Tag::new("t"), ) diff --git a/tests/interval_log.rs b/tests/interval_log.rs index 39b93b7..e59c9e9 100644 --- a/tests/interval_log.rs +++ b/tests/interval_log.rs @@ -98,7 +98,7 @@ mod tests { .filter(|ilh| ilh.tag().is_none()) .map(|ilh| { ( - ilh.start_timestamp(), + round(duration_as_fp_seconds(ilh.start_timestamp())), round(duration_as_fp_seconds(ilh.duration())), ilh.max(), ) @@ -113,7 +113,7 @@ mod tests { .filter(|ilh| !ilh.tag().is_none()) .map(|ilh| { ( - ilh.start_timestamp(), + round(duration_as_fp_seconds(ilh.start_timestamp())), round(duration_as_fp_seconds(ilh.duration())), ilh.max(), ) @@ -263,7 +263,12 @@ mod tests { .map(|s| Tag::new(s.as_str()).unwrap()); writer - .write_histogram(&h, i as f64, time::Duration::new(10_000 + i as u64, 0), tag) + .write_histogram( + &h, + time::Duration::from_secs(i as u64), + time::Duration::new(10_000 + i as u64, 0), + tag, + ) .unwrap(); writer.write_comment(&format!("line {}", i)).unwrap(); @@ -292,7 +297,7 @@ mod tests { assert_eq!(original_hist, &decoded_hist); - assert_eq!(index as f64, ilh.start_timestamp()); + assert_eq!(index as u64, ilh.start_timestamp().as_secs()); assert_eq!( time::Duration::new(10_000 + index as u64, 0), ilh.duration() From 2e247693c7e1ef3ed290267a35abc8392c20da61 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 3 Nov 2017 16:53:04 -0500 Subject: [PATCH 16/18] Apply rustfmt, and appease line length warnings --- benches/interval_log.rs | 8 ++++++-- src/iterators/linear.rs | 6 +++--- src/iterators/log.rs | 6 +++--- src/lib.rs | 3 ++- src/serialization/v2_serializer.rs | 4 +++- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/benches/interval_log.rs b/benches/interval_log.rs index 9b2b29a..2984e80 100644 --- a/benches/interval_log.rs +++ b/benches/interval_log.rs @@ -43,7 +43,9 @@ fn write_interval_log_1k_hist_10k_value(b: &mut Bencher) { let dur = time::Duration::new(5, 678_000_000); for h in histograms.iter() { - writer.write_histogram(h, time::Duration::new(1, 234_000_000), dur, None).unwrap(); + writer + .write_histogram(h, 
time::Duration::new(1, 234_000_000), dur, None) + .unwrap(); } }) } @@ -72,7 +74,9 @@ fn parse_interval_log_1k_hist_10k_value(b: &mut Bencher) { let dur = time::Duration::new(5, 678_000_000); for h in histograms.iter() { - writer.write_histogram(h, time::Duration::new(1, 234_000_000), dur, None).unwrap(); + writer + .write_histogram(h, time::Duration::new(1, 234_000_000), dur, None) + .unwrap(); } } diff --git a/src/iterators/linear.rs b/src/iterators/linear.rs index efa5f50..b433c3d 100644 --- a/src/iterators/linear.rs +++ b/src/iterators/linear.rs @@ -22,6 +22,8 @@ impl<'a, T: 'a + Counter> Iter<'a, T> { value_units_per_bucket > 0, "value_units_per_bucket must be > 0" ); + + let new_lowest = hist.lowest_equivalent(value_units_per_bucket - 1); HistogramIterator::new( hist, Iter { @@ -29,9 +31,7 @@ impl<'a, T: 'a + Counter> Iter<'a, T> { value_units_per_bucket, // won't underflow because value_units_per_bucket > 0 current_step_highest_value_reporting_level: value_units_per_bucket - 1, - current_step_lowest_value_reporting_level: hist.lowest_equivalent( - value_units_per_bucket - 1, - ), + current_step_lowest_value_reporting_level: new_lowest, }, ) } diff --git a/src/iterators/log.rs b/src/iterators/log.rs index 543d6b6..42a226c 100644 --- a/src/iterators/log.rs +++ b/src/iterators/log.rs @@ -27,6 +27,8 @@ impl<'a, T: 'a + Counter> Iter<'a, T> { "value_units_per_bucket must be > 0" ); assert!(log_base > 1.0, "log_base must be > 1.0"); + + let new_lowest = hist.lowest_equivalent(value_units_in_first_bucket - 1); HistogramIterator::new( hist, Iter { @@ -34,9 +36,7 @@ impl<'a, T: 'a + Counter> Iter<'a, T> { log_base, next_value_reporting_level: value_units_in_first_bucket as f64, current_step_highest_value_reporting_level: value_units_in_first_bucket - 1, - current_step_lowest_value_reporting_level: hist.lowest_equivalent( - value_units_in_first_bucket - 1, - ), + current_step_lowest_value_reporting_level: new_lowest, }, ) } diff --git a/src/lib.rs b/src/lib.rs index 9ab2df3..25a8d59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -780,7 +780,8 @@ impl Histogram { }; // Already checked that high >= 2*low - h.resize(high).map_err(|_| CreationError::UsizeTypeTooSmall)?; + h.resize(high) + .map_err(|_| CreationError::UsizeTypeTooSmall)?; Ok(h) } diff --git a/src/serialization/v2_serializer.rs b/src/serialization/v2_serializer.rs index 91bc38e..f384ba1 100644 --- a/src/serialization/v2_serializer.rs +++ b/src/serialization/v2_serializer.rs @@ -149,7 +149,9 @@ pub fn encode_counts( // serializing. Don't want to silently eat counts beyond i63 max when serializing. // Perhaps we should provide some sort of pluggability here -- choose whether you want // to truncate counts to i63 max, or report errors if you need maximum fidelity? - count.to_i64().ok_or(V2SerializeError::CountNotSerializable)? + count + .to_i64() + .ok_or(V2SerializeError::CountNotSerializable)? 
}; let zz = zig_zag_encode(count_or_zeros); From bba89704718047e0e29f133e75b7970fb8caba94 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Fri, 3 Nov 2017 19:02:15 -0500 Subject: [PATCH 17/18] Turn comments with newlines into multiple comment lines --- src/serialization/interval_log/mod.rs | 32 +++++++------ src/serialization/interval_log/tests.rs | 63 ++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 6ebc4b0..6472e6b 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -166,7 +166,7 @@ extern crate base64; use std::{fmt, io, ops, str, time}; use std::fmt::Write; -use nom::{double, line_ending, not_line_ending, IResult}; +use nom::{double, IResult}; use super::super::{Counter, Histogram}; use super::Serializer; @@ -195,8 +195,7 @@ impl IntervalLogWriterBuilder { /// Add a comment line to be written when the writer is built. /// - /// If you do silly things like write a comment with a newline character, you'll end up with - /// an un-parseable log file. Don't do that. + /// Comments containing '\n' will be transformed into multiple lines of comments. pub fn add_comment(&mut self, s: &str) -> &mut Self { self.comments.push(s.to_owned()); self @@ -250,7 +249,7 @@ impl IntervalLogWriterBuilder { max_value_divisor: self.max_value_divisor, }; - for c in self.comments.iter() { + for c in &self.comments { internal_writer.write_comment(&c)?; } @@ -308,7 +307,9 @@ pub struct IntervalLogWriter<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> { } impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> IntervalLogWriter<'a, 'b, W, S> { - /// Add a comment line. + /// Write a comment line. + /// + /// Comments containing '\n' will be transformed into multiple lines of comments. pub fn write_comment(&mut self, s: &str) -> io::Result<()> { self.internal_writer.write_comment(s) } @@ -365,7 +366,11 @@ impl<'a, 'b, W: 'a + io::Write, S: 'b + Serializer> InternalLogWriter<'a, 'b, W, } fn write_comment(&mut self, s: &str) -> io::Result<()> { - write!(self.writer, "#{}\n", s) + for l in s.split('\n') { + write!(self.writer, "#{}\n", l)?; + } + + Ok(()) } fn write_histogram( @@ -525,7 +530,7 @@ pub enum LogIteratorError { /// /// Because histogram deserialization is deferred, parsing logs is fast. See the `interval_log` /// benchmark if you wish to see how it does on your hardware. As a baseline, parsing a log of 1000 -/// random histograms of 10,000 values each takes 8ms total on an E5-1650v3. +/// random histograms of 10,000 values each takes 2ms total on an E5-1650v3. /// /// Deferring deserialization is handy because it allows you to cheaply navigate the log to find /// the records you care about (e.g. 
ones in a certain time range, or with a certain tag) without @@ -626,8 +631,7 @@ named!(start_time<&[u8], LogEntry>, tag!("#[StartTime: ") >> n: double >> char!(' ') >> - not_line_ending >> - line_ending >> + take_until_and_consume!("\n") >> (LogEntry::StartTime(n)) )); @@ -636,8 +640,7 @@ named!(base_time<&[u8], LogEntry>, tag!("#[BaseTime: ") >> n: double >> char!(' ') >> - not_line_ending >> - line_ending >> + take_until_and_consume!("\n") >> (LogEntry::BaseTime(n)) )); @@ -655,8 +658,7 @@ named!(interval_hist<&[u8], LogEntry>, char!(',') >> max: double >> char!(',') >> - encoded_histogram: map_res!(not_line_ending, str::from_utf8) >> - line_ending >> + encoded_histogram: map_res!(take_until_and_consume!("\n"), str::from_utf8) >> (LogEntry::Interval(IntervalLogHistogram { tag, start_timestamp, @@ -670,11 +672,11 @@ named!(interval_hist<&[u8], LogEntry>, named!(log_entry<&[u8], LogEntry>, alt_complete!(start_time | base_time | interval_hist)); named!(comment_line<&[u8], ()>, - do_parse!(tag!("#") >> not_line_ending >> line_ending >> (())) + do_parse!(tag!("#") >> take_until_and_consume!("\n") >> (())) ); named!(legend<&[u8], ()>, - do_parse!(tag!("\"StartTimestamp\"") >> not_line_ending >> line_ending >> (())) + do_parse!(tag!("\"StartTimestamp\"") >> take_until_and_consume!("\n") >> (())) ); named!(ignored_line<&[u8], ()>, alt!(comment_line | legend)); diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index a591612..89525ba 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -1,4 +1,4 @@ -use std::time; +use std::{iter, time}; use std::ops::Add; use super::super::super::*; @@ -35,6 +35,67 @@ fn write_header_then_interval_comment() { assert_eq!("#foo\n#bar\n#baz\n", str::from_utf8(&buf[..]).unwrap()); } +#[test] +fn write_comment_control_characters_still_parseable() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + let mut control_chars = String::new(); + + // control chars are U+0000-001F, 7F, 80-9F + for c_byte in (0..0x20_u8).chain(iter::once(0x7F)).chain(0x80..0xA0) { + let c = c_byte as char; + assert!(c.is_control()); + control_chars.push(c); + } + + assert_eq!(2 * 16 + 1 + 2 * 16, control_chars.chars().count()); + + { + let mut log_writer = IntervalLogWriterBuilder::new() + .add_comment("unicode") + .add_comment(&control_chars) + .add_comment("whew") + .with_start_time(system_time_after_epoch(123, 456_000_000)) + .begin_log_with(&mut buf, &mut serializer) + .unwrap(); + log_writer.write_comment("baz").unwrap(); + } + + let before_newline = &control_chars[0..10]; + let after_newline = &control_chars[11..]; + let expected = format!( + "#unicode\n#{}\n#{}\n#whew\n#[StartTime: 123.456 (seconds since epoch)]\n#baz\n", + before_newline, + after_newline + ); + assert_eq!(&expected, str::from_utf8(&buf[..]).unwrap()); + + let mut i = IntervalLogIterator::new(&buf); + assert_eq!(Some(Ok(LogEntry::StartTime(123.456))), i.next()); + assert_eq!(None, i.next()); +} + +#[test] +fn write_comment_newline_wraps() { + let mut buf = Vec::new(); + let mut serializer = V2Serializer::new(); + + { + let _ = IntervalLogWriterBuilder::new() + .add_comment("before") + .add_comment("new\nline") + .add_comment("after") + .begin_log_with(&mut buf, &mut serializer) + .unwrap(); + } + + assert_eq!( + "#before\n#new\n#line\n#after\n", + str::from_utf8(&buf[..]).unwrap() + ); +} + #[test] fn write_headers_multiple_times_only_last_is_used() { let mut buf = Vec::new(); From 
cf9ec24ebb092eafeb80447cda0829b80a1fce93 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Mon, 6 Nov 2017 13:03:52 -0600 Subject: [PATCH 18/18] Parse fractional seconds into a Duration directly. Parsing into an f64 then separately truncating to an int and multiplying the fractional part by 1_000_000_000 introduced errors in the fractional part since frequently abc.xyz cannot produce 0.xyz as an f64, but has to have 0.xyz0000001 or something like that. --- benches/record.rs | 6 +- src/serialization/benchmarks.rs | 6 +- src/serialization/interval_log/mod.rs | 76 ++++++++++++----- src/serialization/interval_log/tests.rs | 105 ++++++++++++++++++++---- 4 files changed, 149 insertions(+), 44 deletions(-) diff --git a/benches/record.rs b/benches/record.rs index 758988b..128ba63 100644 --- a/benches/record.rs +++ b/benches/record.rs @@ -175,7 +175,9 @@ fn do_add_benchmark Histogram>( addends.push(h); } - b.iter(|| for h in addends.iter() { - accum.add(h).unwrap(); + b.iter(|| { + for h in addends.iter() { + accum.add(h).unwrap(); + } }) } diff --git a/src/serialization/benchmarks.rs b/src/serialization/benchmarks.rs index 5bfceaf..972fbd7 100644 --- a/src/serialization/benchmarks.rs +++ b/src/serialization/benchmarks.rs @@ -63,8 +63,10 @@ fn do_varint_write_rand(b: &mut Bencher, range: Range) { } let mut buf = [0; 9]; - b.iter(|| for i in vec.iter() { - let _ = varint_write(*i, &mut buf); + b.iter(|| { + for i in vec.iter() { + let _ = varint_write(*i, &mut buf); + } }); } diff --git a/src/serialization/interval_log/mod.rs b/src/serialization/interval_log/mod.rs index 6472e6b..4ea70a4 100644 --- a/src/serialization/interval_log/mod.rs +++ b/src/serialization/interval_log/mod.rs @@ -166,7 +166,7 @@ extern crate base64; use std::{fmt, io, ops, str, time}; use std::fmt::Write; -use nom::{double, IResult}; +use nom::{double, is_digit, ErrorKind, IResult}; use super::super::{Counter, Histogram}; use super::Serializer; @@ -450,9 +450,8 @@ impl<'a> ops::Deref for Tag<'a> { #[derive(PartialEq, Debug)] pub struct IntervalLogHistogram<'a> { tag: Option>, - start_timestamp: f64, - // lazily map to Duration to save parsing time and a few bytes of space on ILH - duration: f64, + start_timestamp: time::Duration, + duration: time::Duration, max: f64, encoded_histogram: &'a str, } @@ -470,12 +469,12 @@ impl<'a> IntervalLogHistogram<'a> { /// the log, in which case you may wish to consider this number as a delta vs those timestamps. /// See the module-level documentation about timestamps. pub fn start_timestamp(&self) -> time::Duration { - fp_seconds_as_duration(self.start_timestamp) + self.start_timestamp } /// Duration of the interval in seconds. pub fn duration(&self) -> time::Duration { - fp_seconds_as_duration(self.duration) + self.duration } /// Max value in the encoded histogram @@ -506,9 +505,9 @@ impl<'a> IntervalLogHistogram<'a> { #[allow(variant_size_differences)] pub enum LogEntry<'a> { /// Logs may include a StartTime. If present, it represents seconds since the epoch. - StartTime(f64), + StartTime(time::Duration), /// Logs may include a BaseTime. If present, it represents seconds since the epoch. - BaseTime(f64), + BaseTime(time::Duration), /// An individual interval histogram. 
Interval(IntervalLogHistogram<'a>), } @@ -606,13 +605,6 @@ impl<'a> Iterator for IntervalLogIterator<'a> { } } -fn fp_seconds_as_duration(fp_secs: f64) -> time::Duration { - let secs = fp_secs as u64; - // can't overflow because this can be at most 1 billion which is approx 2^30 - let nsecs = (fp_secs.fract() * 1_000_000_000_f64) as u32; - time::Duration::new(secs, nsecs) -} - fn duration_as_fp_seconds(d: time::Duration) -> f64 { d.as_secs() as f64 + d.subsec_nanos() as f64 / 1_000_000_000_f64 } @@ -629,19 +621,19 @@ fn system_time_as_fp_seconds(time: time::SystemTime) -> f64 { named!(start_time<&[u8], LogEntry>, do_parse!( tag!("#[StartTime: ") >> - n: double >> + dur: fract_sec_duration >> char!(' ') >> take_until_and_consume!("\n") >> - (LogEntry::StartTime(n)) + (LogEntry::StartTime(dur)) )); named!(base_time<&[u8], LogEntry>, do_parse!( tag!("#[BaseTime: ") >> - n: double >> + dur: fract_sec_duration >> char!(' ') >> take_until_and_consume!("\n") >> - (LogEntry::BaseTime(n)) + (LogEntry::BaseTime(dur)) )); named!(interval_hist<&[u8], LogEntry>, @@ -652,9 +644,9 @@ named!(interval_hist<&[u8], LogEntry>, map!(pair!(tag!("Tag="), take_until_and_consume!(",")), |p| p.1), str::from_utf8), |s| Tag(s))) >> - start_timestamp: double >> + start_timestamp: fract_sec_duration >> char!(',') >> - duration: double >> + duration: fract_sec_duration >> char!(',') >> max: double >> char!(',') >> @@ -669,7 +661,8 @@ named!(interval_hist<&[u8], LogEntry>, ) ); -named!(log_entry<&[u8], LogEntry>, alt_complete!(start_time | base_time | interval_hist)); +named!(log_entry<&[u8], LogEntry>, + alt_complete!(start_time | base_time | interval_hist)); named!(comment_line<&[u8], ()>, do_parse!(tag!("#") >> take_until_and_consume!("\n") >> (())) @@ -681,5 +674,44 @@ named!(legend<&[u8], ()>, named!(ignored_line<&[u8], ()>, alt!(comment_line | legend)); +fn fract_sec_duration(input: &[u8]) -> IResult<&[u8], time::Duration> { + match fract_sec_tuple(input) { + IResult::Done(rest, data) => { + let (secs, nanos_str) = data; + + // only read up to 9 digits since we can only support nanos, not smaller precision + let nanos_parse_res = if nanos_str.len() > 9 { + nanos_str[0..9].parse::() + } else if nanos_str.len() == 9 { + nanos_str.parse::() + } else { + nanos_str + .parse::() + // subtraction will not overflow because len is < 9 + .map(|n| n * 10_u32.pow(9 - nanos_str.len() as u32)) + }; + + if let Ok(nanos) = nanos_parse_res { + return IResult::Done(rest, time::Duration::new(secs, nanos)); + } + + // nanos were invalid utf8. We don't expose these errors, so don't bother defining a + // custom error type. 
+ return IResult::Error(ErrorKind::Custom(0)); + } + IResult::Error(e) => return IResult::Error(e), + IResult::Incomplete(n) => return IResult::Incomplete(n), + } +} + +named!(fract_sec_tuple<&[u8], (u64, &str)>, + do_parse!( + secs: flat_map!(recognize!(take_until!(".")), parse_to!(u64)) >> + tag!(".") >> + nanos_str: map_res!(take_while1!(is_digit), str::from_utf8) >> + (secs, nanos_str) + ) +); + #[cfg(test)] mod tests; diff --git a/src/serialization/interval_log/tests.rs b/src/serialization/interval_log/tests.rs index 89525ba..518a0b9 100644 --- a/src/serialization/interval_log/tests.rs +++ b/src/serialization/interval_log/tests.rs @@ -1,3 +1,7 @@ +extern crate rand; + +use self::rand::Rng; + use std::{iter, time}; use std::ops::Add; @@ -72,7 +76,12 @@ fn write_comment_control_characters_still_parseable() { assert_eq!(&expected, str::from_utf8(&buf[..]).unwrap()); let mut i = IntervalLogIterator::new(&buf); - assert_eq!(Some(Ok(LogEntry::StartTime(123.456))), i.next()); + assert_eq!( + Some(Ok( + LogEntry::StartTime(time::Duration::new(123, 456_000_000)) + )), + i.next() + ); assert_eq!(None, i.next()); } @@ -214,13 +223,73 @@ fn write_base_time() { ); } +#[test] +fn parse_duration_full_ns() { + let (rest, dur) = fract_sec_duration(b"123456.789012345foo").unwrap(); + + assert_eq!(time::Duration::new(123456, 789_012_345), dur); + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_duration_scale_ns() { + let (rest, dur) = fract_sec_duration(b"123456.789012foo").unwrap(); + + assert_eq!(time::Duration::new(123456, 789_012_000), dur); + assert_eq!(b"foo", rest); +} + +#[test] +fn parse_duration_too_many_ns() { + let (rest, dur) = fract_sec_duration(b"123456.7890123456foo").unwrap(); + + // consumes all the numbers, but only parses the first 9 + assert_eq!(time::Duration::new(123456, 789_012_345), dur); + assert_eq!(b"foo", rest); +} + +#[test] +fn duration_fp_roundtrip_accuracy() { + let mut rng = rand::thread_rng(); + + let mut buf = String::new(); + let mut errors = Vec::new(); + for _ in 0..100_000 { + buf.clear(); + + // pick seconds + let secs = rng.gen_range(0, 2_000_000_000); + // pick nsecs that only has ms accuracy + let nsecs = rng.gen_range(0, 1000) * 1000_000; + + let dur = time::Duration::new(secs, nsecs); + let fp_secs = duration_as_fp_seconds(dur); + + write!(&mut buf, "{:.3}", fp_secs).unwrap(); + + let (_, dur2) = fract_sec_duration(buf.as_bytes()).unwrap(); + + if dur != dur2 { + errors.push((dur, dur2)); + } + } + + if !errors.is_empty() { + for &(dur, dur2) in &errors { + println!("{:?} -> {:?}", dur, dur2); + } + } + + assert_eq!(0, errors.len()); +} + #[test] fn parse_start_time_with_human_date() { let (rest, e) = start_time( b"#[StartTime: 1441812279.474 (seconds since epoch), Wed Sep 09 08:24:39 PDT 2015]\nfoo", ).unwrap(); - let expected = LogEntry::StartTime(1441812279.474); + let expected = LogEntry::StartTime(time::Duration::new(1441812279, 474_000_000)); assert_eq!(expected, e); assert_eq!(b"foo", rest); @@ -233,7 +302,7 @@ fn parse_start_time_without_human_date() { // Also, BaseTime doesn't have a human-formatted time. 
let (rest, e) = start_time(b"#[StartTime: 1441812279.474 (seconds since epoch)]\nfoo").unwrap(); - let expected = LogEntry::StartTime(1441812279.474); + let expected = LogEntry::StartTime(time::Duration::new(1441812279, 474_000_000)); assert_eq!(expected, e); assert_eq!(b"foo", rest); @@ -243,7 +312,7 @@ fn parse_start_time_without_human_date() { fn parse_base_time() { let (rest, e) = base_time(b"#[BaseTime: 1441812279.474 (seconds since epoch)]\nfoo").unwrap(); - let expected = LogEntry::BaseTime(1441812279.474); + let expected = LogEntry::BaseTime(time::Duration::new(1441812279, 474_000_000)); assert_eq!(expected, e); assert_eq!(b"foo", rest); @@ -271,8 +340,8 @@ fn parse_interval_hist_no_tag() { let expected = LogEntry::Interval(IntervalLogHistogram { tag: None, - start_timestamp: 0.127, - duration: 1.007, + start_timestamp: time::Duration::new(0, 127_000_000), + duration: time::Duration::new(1, 7_000_000), max: 2.769, encoded_histogram: "couldBeBase64", }); @@ -287,8 +356,8 @@ fn parse_interval_hist_with_tag() { let expected = LogEntry::Interval(IntervalLogHistogram { tag: Some(Tag("t")), - start_timestamp: 0.127, - duration: 1.007, + start_timestamp: time::Duration::new(0, 127_000_000), + duration: time::Duration::new(1, 7_000_000), max: 2.769, encoded_histogram: "couldBeBase64", }); @@ -311,13 +380,13 @@ fn iter_with_ignored_prefix() { let expected0 = LogEntry::Interval(IntervalLogHistogram { tag: Some(Tag("t")), - start_timestamp: 0.127, - duration: 1.007, + start_timestamp: time::Duration::new(0, 127_000_000), + duration: time::Duration::new(1, 7_000_000), max: 2.769, encoded_histogram: "couldBeBase64", }); - let expected1 = LogEntry::StartTime(1441812279.474); + let expected1 = LogEntry::StartTime(time::Duration::new(1441812279, 474_000_000)); assert_eq!(vec![expected0, expected1], entries) } @@ -334,13 +403,13 @@ fn iter_without_ignored_prefix() { let expected0 = LogEntry::Interval(IntervalLogHistogram { tag: Some(Tag("t")), - start_timestamp: 0.127, - duration: 1.007, + start_timestamp: time::Duration::new(0, 127_000_000), + duration: time::Duration::new(1, 7_000_000), max: 2.769, encoded_histogram: "couldBeBase64", }); - let expected1 = LogEntry::StartTime(1441812279.474); + let expected1 = LogEntry::StartTime(time::Duration::new(1441812279, 474_000_000)); assert_eq!(vec![expected0, expected1], entries) } @@ -363,14 +432,14 @@ fn iter_multiple_entrties_with_interleaved_ignored() { let expected0 = LogEntry::Interval(IntervalLogHistogram { tag: Some(Tag("t")), - start_timestamp: 0.127, - duration: 1.007, + start_timestamp: time::Duration::new(0, 127_000_000), + duration: time::Duration::new(1, 7_000_000), max: 2.769, encoded_histogram: "couldBeBase64", }); - let expected1 = LogEntry::StartTime(1441812279.474); - let expected2 = LogEntry::BaseTime(1441812279.474); + let expected1 = LogEntry::StartTime(time::Duration::new(1441812279, 474_000_000)); + let expected2 = LogEntry::BaseTime(time::Duration::new(1441812279, 474_000_000)); assert_eq!(vec![expected0, expected1, expected2], entries) }
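The precision issue the commit message describes can be reproduced with a small standalone sketch (not part of the patch): parsing the whole timestamp as an f64 and then splitting it into seconds and nanoseconds frequently lands a few nanoseconds off, while parsing the integer and fractional digits separately is exact. The helper names below are illustrative only.

use std::time::Duration;

// Old approach: go through f64, then split into secs + nanos.
fn via_f64(s: &str) -> Duration {
    let fp: f64 = s.parse().unwrap();
    let secs = fp as u64;
    let nanos = (fp.fract() * 1_000_000_000_f64) as u32;
    Duration::new(secs, nanos)
}

// New approach: parse seconds and fractional digits as integers and scale the
// fraction up to nanoseconds (this sketch assumes at most 9 fractional digits).
fn direct(s: &str) -> Duration {
    let dot = s.find('.').unwrap();
    let secs: u64 = s[..dot].parse().unwrap();
    let frac = &s[dot + 1..];
    let nanos = frac.parse::<u32>().unwrap() * 10_u32.pow(9 - frac.len() as u32);
    Duration::new(secs, nanos)
}

fn main() {
    // 1441812279.474 is not exactly representable as an f64, so the f64 route
    // can come out a few nanoseconds off, whereas the direct route yields
    // exactly Duration::new(1441812279, 474_000_000).
    println!("via f64: {:?}", via_f64("1441812279.474"));
    println!("direct:  {:?}", direct("1441812279.474"));
}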
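The digit-count rule used by fract_sec_duration can also be exercised on its own: keep at most 9 fractional digits (nanosecond precision) and scale shorter fractions up. This is a minimal sketch of that rule with a hypothetical helper name, not the crate's internal parser.

// Hypothetical helper mirroring the nanos handling in fract_sec_duration.
fn frac_digits_to_nanos(nanos_str: &str) -> Option<u32> {
    if nanos_str.len() > 9 {
        // more digits than nanosecond precision: only read the first 9
        nanos_str[0..9].parse::<u32>().ok()
    } else {
        // 9 or fewer digits: scale up to nanoseconds, e.g. "474" -> 474_000_000
        nanos_str
            .parse::<u32>()
            .ok()
            .map(|n| n * 10_u32.pow(9 - nanos_str.len() as u32))
    }
}

fn main() {
    assert_eq!(Some(789_012_345), frac_digits_to_nanos("7890123456")); // truncated to 9 digits
    assert_eq!(Some(789_012_000), frac_digits_to_nanos("789012"));     // scaled up
    assert_eq!(Some(474_000_000), frac_digits_to_nanos("474"));
}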