From c0d0c2c849c4df471bd7229486cb6e8f7c4e8536 Mon Sep 17 00:00:00 2001
From: Steven Atkinson
Date: Sun, 2 Apr 2023 12:31:06 -0700
Subject: [PATCH] Output loudness normalization (#18)

* Implement loudness

* Apply normalization if active

* Fix JSON find

* Adjust target loudness
---
 NAM/dsp.cpp     | 33 ++++++++++++++++++++++++---------
 NAM/dsp.h       | 17 ++++++++++++++++-
 NAM/get_dsp.cpp | 19 ++++++++++++++-----
 NAM/lstm.cpp    |  6 +++++-
 NAM/lstm.h      |  2 ++
 NAM/wavenet.cpp |  8 +++++++-
 NAM/wavenet.h   |  3 +++
 7 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index d1ae317..b733049 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -15,9 +15,11 @@
 #define tanh_impl_ std::tanh
 // #define tanh_impl_ fast_tanh_
 
-constexpr auto _INPUT_BUFFER_SAFETY_FACTOR = 32;
+constexpr const long _INPUT_BUFFER_SAFETY_FACTOR = 32;
 
-DSP::DSP() { this->_stale_params = true; }
+DSP::DSP() : mLoudness(TARGET_DSP_LOUDNESS), mNormalizeOutputLoudness(false), _stale_params(true) {}
+
+DSP::DSP(const double loudness) : mLoudness(loudness), mNormalizeOutputLoudness(false), _stale_params(true) {}
 
 void DSP::process(double **inputs, double **outputs, const int num_channels,
                   const int num_frames,
@@ -71,14 +73,18 @@ void DSP::_process_core_() {
 
 void DSP::_apply_output_level_(double **outputs, const int num_channels,
                                const int num_frames, const double gain) {
-  for (int c = 0; c < num_channels; c++)
+  const double loudnessGain = pow(10.0, -(this->mLoudness - TARGET_DSP_LOUDNESS) / 20.0);
+  const double finalGain = this->mNormalizeOutputLoudness ? gain * loudnessGain : gain;
+  for (int c = 0; c < num_channels; c++)
     for (int s = 0; s < num_frames; s++)
-      outputs[c][s] = double(gain * this->_core_dsp_output[s]);
+      outputs[c][s] = double(finalGain * this->_core_dsp_output[s]);
 }
 
 // Buffer =====================================================================
 
-Buffer::Buffer(const int receptive_field) : DSP() {
+Buffer::Buffer(const int receptive_field) : Buffer(TARGET_DSP_LOUDNESS, receptive_field) {}
+
+Buffer::Buffer(const double loudness, const int receptive_field) : DSP(loudness) {
   this->_set_receptive_field(receptive_field);
 }
 
@@ -146,8 +152,13 @@ void Buffer::finalize_(const int num_frames) {
 // Linear =====================================================================
 
 Linear::Linear(const int receptive_field, const bool _bias,
-               const std::vector<float> &params)
-    : Buffer(receptive_field) {
+               const std::vector<float>& params) : Linear(TARGET_DSP_LOUDNESS, receptive_field, _bias, params)
+{}
+
+Linear::Linear(const double loudness, const int receptive_field, const bool _bias,
+               const std::vector<float>& params)
+    : Buffer(loudness, receptive_field)
+{
   if (params.size() != (receptive_field + (_bias ? 1 : 0)))
     throw std::runtime_error("Params vector does not match expected size based "
                              "on architecture parameters");
@@ -426,10 +437,14 @@ void convnet::_Head::process_(const Eigen::MatrixXf &input,
     output(i) = this->_bias + input.col(j).dot(this->_weight);
 }
 
-convnet::ConvNet::ConvNet(const int channels, const std::vector<int> &dilations,
+convnet::ConvNet::ConvNet(const int channels, const std::vector<int>& dilations,
+                          const bool batchnorm, const std::string activation,
+                          std::vector<float>& params) : ConvNet(TARGET_DSP_LOUDNESS, channels, dilations, batchnorm, activation, params) {}
+
+convnet::ConvNet::ConvNet(const double loudness, const int channels, const std::vector<int> &dilations,
                           const bool batchnorm, const std::string activation,
                           std::vector<float> &params)
-    : Buffer(*std::max_element(dilations.begin(), dilations.end())) {
+    : Buffer(loudness, *std::max_element(dilations.begin(), dilations.end())) {
   this->_verify_params(channels, dilations, batchnorm, params.size());
   this->_blocks.resize(dilations.size());
   std::vector<float>::iterator it = params.begin();
diff --git a/NAM/dsp.h b/NAM/dsp.h
index a0467a5..a317caf 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -28,9 +28,13 @@ class DSPParam {
 };
 // And the params shall be provided as a std::vector.
 
+// How loud do we want the models to be? in dB
+#define TARGET_DSP_LOUDNESS -18.0
+
 class DSP {
 public:
-  DSP();
+  DSP();
+  DSP(const double loudness);
   // process() does all of the processing requried to take `inputs` array and
   // fill in the required values on `outputs`.
   // To do this:
@@ -51,8 +55,13 @@ class DSP {
   // that actually uses them, which varies depends on the particulars of the
   // DSP subclass implementation.
   virtual void finalize_(const int num_frames);
+  void SetNormalize(const bool normalize) { this->mNormalizeOutputLoudness = normalize; };
 
 protected:
+  // How loud is the model?
+  double mLoudness;
+  // Should we normalize according to this loudness?
+  bool mNormalizeOutputLoudness;
   // Parameters (aka "knobs")
   std::unordered_map<std::string, double> _params;
   // If the params have changed since the last buffer was processed:
@@ -94,6 +103,7 @@ class DSP {
 class Buffer : public DSP {
 public:
   Buffer(const int receptive_field);
+  Buffer(const double loudness, const int receptive_field);
   void finalize_(const int num_frames);
 
 protected:
@@ -119,6 +129,8 @@ class Linear : public Buffer {
 public:
   Linear(const int receptive_field, const bool _bias,
          const std::vector<float> &params);
+  Linear(const double loudness, const int receptive_field, const bool _bias,
+         const std::vector<float>& params);
   void _process_core_() override;
 
 protected:
@@ -270,6 +282,9 @@ class ConvNet : public Buffer {
   ConvNet(const int channels, const std::vector<int> &dilations,
           const bool batchnorm, const std::string activation,
           std::vector<float> &params);
+  ConvNet(const double loudness, const int channels, const std::vector<int>& dilations,
+          const bool batchnorm, const std::string activation,
+          std::vector<float>& params);
 
 protected:
   std::vector<ConvNetBlock> _blocks;
diff --git a/NAM/get_dsp.cpp b/NAM/get_dsp.cpp
index 805c94c..9689f79 100644
--- a/NAM/get_dsp.cpp
+++ b/NAM/get_dsp.cpp
@@ -46,11 +46,20 @@ std::unique_ptr<DSP> get_dsp(const std::filesystem::path config_filename) {
   auto architecture = j["architecture"];
   nlohmann::json config = j["config"];
   std::vector<float> params = _get_weights(j, config_filename);
 
+  bool haveLoudness = false;
+  double loudness = TARGET_DSP_LOUDNESS;
+  if (j.find("metadata") != j.end()) {
+    if (j["metadata"].find("loudness") != j["metadata"].end()) {
+      loudness = j["metadata"]["loudness"];
+      haveLoudness = true;
+    }
+  }
+
   if (architecture == "Linear") {
     const int receptive_field = config["receptive_field"];
     const bool _bias = config["bias"];
-    return std::make_unique<Linear>(receptive_field, _bias, params);
+    return std::make_unique<Linear>(loudness, receptive_field, _bias, params);
   } else if (architecture == "ConvNet") {
     const int channels = config["channels"];
     const bool batchnorm = config["batchnorm"];
@@ -58,20 +67,20 @@ std::unique_ptr<DSP> get_dsp(const std::filesystem::path config_filename) {
     for (int i = 0; i < config["dilations"].size(); i++)
       dilations.push_back(config["dilations"][i]);
     const std::string activation = config["activation"];
-    return std::make_unique<convnet::ConvNet>(channels, dilations, batchnorm,
+    return std::make_unique<convnet::ConvNet>(loudness, channels, dilations, batchnorm,
                                               activation, params);
   } else if (architecture == "LSTM") {
     const int num_layers = config["num_layers"];
     const int input_size = config["input_size"];
     const int hidden_size = config["hidden_size"];
     auto json = nlohmann::json{};
-    return std::make_unique<lstm::LSTM>(num_layers, input_size, hidden_size,
+    return std::make_unique<lstm::LSTM>(loudness, num_layers, input_size, hidden_size,
                                         params, json);
   } else if (architecture == "CatLSTM") {
     const int num_layers = config["num_layers"];
     const int input_size = config["input_size"];
     const int hidden_size = config["hidden_size"];
-    return std::make_unique<lstm::LSTM>(num_layers, input_size, hidden_size,
+    return std::make_unique<lstm::LSTM>(loudness, num_layers, input_size, hidden_size,
                                         params, config["parametric"]);
   } else if (architecture == "WaveNet" || architecture == "CatWaveNet") {
     std::vector<wavenet::LayerArrayParams> layer_array_params;
@@ -94,7 +103,7 @@ std::unique_ptr<DSP> get_dsp(const std::filesystem::path config_filename) {
     auto parametric_json = architecture == "CatWaveNet" ? config["parametric"]
                                                          : nlohmann::json{};
     return std::make_unique<wavenet::WaveNet>(
-        layer_array_params, head_scale, with_head, parametric_json, params);
+        loudness, layer_array_params, head_scale, with_head, parametric_json, params);
   } else {
     throw std::runtime_error("Unrecognized architecture");
   }
diff --git a/NAM/lstm.cpp b/NAM/lstm.cpp
index e235a71..03e6503 100644
--- a/NAM/lstm.cpp
+++ b/NAM/lstm.cpp
@@ -51,8 +51,12 @@ void lstm::LSTMCell::process_(const Eigen::VectorXf &x) {
 }
 
 lstm::LSTM::LSTM(const int num_layers, const int input_size,
+                 const int hidden_size, std::vector<float>& params,
+                 nlohmann::json& parametric) : LSTM(TARGET_DSP_LOUDNESS, num_layers, input_size, hidden_size, params, parametric) {}
+
+lstm::LSTM::LSTM(const double loudness, const int num_layers, const int input_size,
                  const int hidden_size, std::vector<float> &params,
-                 nlohmann::json &parametric) {
+                 nlohmann::json &parametric) : DSP(loudness) {
   this->_init_parametric(parametric);
   std::vector<float>::iterator it = params.begin();
   for (int i = 0; i < num_layers; i++)
diff --git a/NAM/lstm.h b/NAM/lstm.h
index 2f83d04..8aadbcf 100644
--- a/NAM/lstm.h
+++ b/NAM/lstm.h
@@ -53,6 +53,8 @@ class LSTM : public DSP {
 public:
   LSTM(const int num_layers, const int input_size, const int hidden_size,
        std::vector<float> &params, nlohmann::json &parametric);
+  LSTM(const double loudness, const int num_layers, const int input_size, const int hidden_size,
+       std::vector<float>& params, nlohmann::json& parametric);
 
 protected:
   Eigen::VectorXf _head_weight;
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 59d10b5..ea5dcd8 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -227,10 +227,16 @@ void wavenet::_Head::_apply_activation_(Eigen::MatrixXf &x) {
 // WaveNet ====================================================================
 
 wavenet::WaveNet::WaveNet(
+    const std::vector<LayerArrayParams>& layer_array_params,
+    const float head_scale, const bool with_head, nlohmann::json parametric,
+    std::vector<float> params) : WaveNet(TARGET_DSP_LOUDNESS, layer_array_params, head_scale, with_head, parametric, params) {}
+
+wavenet::WaveNet::WaveNet(
+    const double loudness,
     const std::vector<LayerArrayParams> &layer_array_params,
     const float head_scale, const bool with_head, nlohmann::json parametric,
     std::vector<float> params)
-    : //_head(channels, head_layers, head_channels, head_activation),
+    : DSP(loudness),
       _num_frames(0), _head_scale(head_scale) {
   if (with_head)
     throw std::runtime_error("Head not implemented!");
diff --git a/NAM/wavenet.h b/NAM/wavenet.h
index 7ae61e0..cb22991 100644
--- a/NAM/wavenet.h
+++ b/NAM/wavenet.h
@@ -164,6 +164,9 @@ class WaveNet : public DSP {
   WaveNet(const std::vector<LayerArrayParams> &layer_array_params,
           const float head_scale, const bool with_head, nlohmann::json parametric,
           std::vector<float> params);
+  WaveNet(const double loudness, const std::vector<LayerArrayParams>& layer_array_params,
+          const float head_scale, const bool with_head,
+          nlohmann::json parametric, std::vector<float> params);
 
   // WaveNet(WaveNet&&) = default;
   // WaveNet& operator=(WaveNet&&) = default;
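Usage note (editor's addition, not part of the patch): a minimal host-side sketch of how the API introduced here might be driven. It assumes get_dsp() is declared in NAM/dsp.h (it is defined in NAM/get_dsp.cpp above); the model file name, the -12 dB example loudness, and the elided process()/finalize_() calls are illustrative only. The delegating constructors default to TARGET_DSP_LOUDNESS, so existing callers that never pass a loudness keep their previous behavior.

// Hypothetical sketch relying only on symbols shown in this patch:
// get_dsp(), DSP::SetNormalize(), and TARGET_DSP_LOUDNESS.
#include <cmath>
#include <cstdio>
#include <filesystem>
#include <memory>

#include "NAM/dsp.h" // assumed to declare get_dsp(); defined in NAM/get_dsp.cpp

int main()
{
  // get_dsp() now reads "metadata"."loudness" (in dB) from the model JSON when
  // present and falls back to TARGET_DSP_LOUDNESS (-18 dB) otherwise.
  std::unique_ptr<DSP> model = get_dsp(std::filesystem::path("model.nam")); // hypothetical file name

  // Opt in to output loudness normalization (off by default).
  model->SetNormalize(true);

  // The gain folded into the output level by _apply_output_level_():
  // 10^(-(loudness - TARGET_DSP_LOUDNESS) / 20). For a -12 dB model this is ~0.5.
  const double modelLoudness = -12.0; // example value; normally read from the file's metadata
  const double loudnessGain = std::pow(10.0, -(modelLoudness - TARGET_DSP_LOUDNESS) / 20.0);
  std::printf("Normalization gain: %.3f\n", loudnessGain);

  // ...then call model->process(...) and model->finalize_(...) per audio block as usual.
  return 0;
}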