jishengpeng · saveriyo · Sep 11, 2024 · Aug 31, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,168 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+wavtokenizer/data/train
+wavtokenizer/data/infer
+wavtokenizer/result
+wavtokenizer/metrics/*.ckpt
+wavtokenizer/metrics/*.pt
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@ To use WavTokenizer, install it using:
 ```bash
 conda create -n wavtokenizer python=3.9
 conda activate wavtokenizer
-pip install -r requirements.txt
+pip install -e .
 ```
 
 ## Infer
@@ -36,10 +36,10 @@ pip install -r requirements.txt
 
 ```python
 
-from encoder.utils import convert_audio
+from wavtokenizer.encoder.utils import convert_audio
 import torchaudio
 import torch
-from decoder.pretrained import WavTokenizer
+from wavtokenizer.decoder.pretrained import WavTokenizer
 
 
 device=torch.device('cpu')
@@ -65,10 +65,10 @@ torchaudio.save(audio_outpath, audio_out, sample_rate=24000, encoding='PCM_S', b
 ### Part2: Generating discrete codecs
 ```python
 
-from encoder.utils import convert_audio
+from wavtokenizer.encoder.utils import convert_audio
 import torchaudio
 import torch
-from decoder.pretrained import WavTokenizer
+from wavtokenizer.decoder.pretrained import WavTokenizer
 
 device=torch.device('cpu')
 

diff --git a/decoder/__init__.py b/decoder/__init__.py
diff --git a/decoder/__pycache__/__init__.cpython-310.pyc b/decoder/__pycache__/__init__.cpython-310.pyc
diff --git a/decoder/__pycache__/__init__.cpython-38.pyc b/decoder/__pycache__/__init__.cpython-38.pyc
diff --git a/decoder/__pycache__/dataset.cpython-310.pyc b/decoder/__pycache__/dataset.cpython-310.pyc
diff --git a/decoder/__pycache__/discriminator_dac.cpython-310.pyc b/decoder/__pycache__/discriminator_dac.cpython-310.pyc
diff --git a/decoder/__pycache__/discriminators.cpython-310.pyc b/decoder/__pycache__/discriminators.cpython-310.pyc
diff --git a/decoder/__pycache__/experiment.cpython-310.pyc b/decoder/__pycache__/experiment.cpython-310.pyc
diff --git a/decoder/__pycache__/feature_extractors.cpython-310.pyc b/decoder/__pycache__/feature_extractors.cpython-310.pyc
diff --git a/decoder/__pycache__/feature_extractors.cpython-38.pyc b/decoder/__pycache__/feature_extractors.cpython-38.pyc
diff --git a/decoder/__pycache__/heads.cpython-310.pyc b/decoder/__pycache__/heads.cpython-310.pyc
diff --git a/decoder/__pycache__/helpers.cpython-310.pyc b/decoder/__pycache__/helpers.cpython-310.pyc
diff --git a/decoder/__pycache__/loss.cpython-310.pyc b/decoder/__pycache__/loss.cpython-310.pyc
diff --git a/decoder/__pycache__/models.cpython-310.pyc b/decoder/__pycache__/models.cpython-310.pyc
diff --git a/decoder/__pycache__/modules.cpython-310.pyc b/decoder/__pycache__/modules.cpython-310.pyc
diff --git a/decoder/__pycache__/modules.cpython-38.pyc b/decoder/__pycache__/modules.cpython-38.pyc
diff --git a/decoder/__pycache__/pretrained.cpython-310.pyc b/decoder/__pycache__/pretrained.cpython-310.pyc
diff --git a/decoder/__pycache__/pretrained.cpython-38.pyc b/decoder/__pycache__/pretrained.cpython-38.pyc
diff --git a/decoder/__pycache__/pretrained_model.cpython-310.pyc b/decoder/__pycache__/pretrained_model.cpython-310.pyc
diff --git a/decoder/__pycache__/spectral_ops.cpython-310.pyc b/decoder/__pycache__/spectral_ops.cpython-310.pyc
diff --git a/encoder/__pycache__/__init__.cpython-310.pyc b/encoder/__pycache__/__init__.cpython-310.pyc
diff --git a/encoder/__pycache__/__init__.cpython-38.pyc b/encoder/__pycache__/__init__.cpython-38.pyc
diff --git a/encoder/__pycache__/distrib.cpython-310.pyc b/encoder/__pycache__/distrib.cpython-310.pyc
diff --git a/encoder/__pycache__/distrib.cpython-38.pyc b/encoder/__pycache__/distrib.cpython-38.pyc
diff --git a/encoder/__pycache__/model.cpython-310.pyc b/encoder/__pycache__/model.cpython-310.pyc
diff --git a/encoder/__pycache__/model.cpython-38.pyc b/encoder/__pycache__/model.cpython-38.pyc
diff --git a/encoder/__pycache__/utils.cpython-310.pyc b/encoder/__pycache__/utils.cpython-310.pyc
diff --git a/encoder/__pycache__/utils.cpython-38.pyc b/encoder/__pycache__/utils.cpython-38.pyc
diff --git a/encoder/modules/__pycache__/__init__.cpython-310.pyc b/encoder/modules/__pycache__/__init__.cpython-310.pyc
diff --git a/encoder/modules/__pycache__/__init__.cpython-38.pyc b/encoder/modules/__pycache__/__init__.cpython-38.pyc
diff --git a/encoder/modules/__pycache__/conv.cpython-310.pyc b/encoder/modules/__pycache__/conv.cpython-310.pyc
diff --git a/encoder/modules/__pycache__/conv.cpython-38.pyc b/encoder/modules/__pycache__/conv.cpython-38.pyc
diff --git a/encoder/modules/__pycache__/lstm.cpython-310.pyc b/encoder/modules/__pycache__/lstm.cpython-310.pyc
diff --git a/encoder/modules/__pycache__/lstm.cpython-38.pyc b/encoder/modules/__pycache__/lstm.cpython-38.pyc
diff --git a/encoder/modules/__pycache__/norm.cpython-310.pyc b/encoder/modules/__pycache__/norm.cpython-310.pyc
diff --git a/encoder/modules/__pycache__/norm.cpython-38.pyc b/encoder/modules/__pycache__/norm.cpython-38.pyc
diff --git a/encoder/modules/__pycache__/seanet.cpython-310.pyc b/encoder/modules/__pycache__/seanet.cpython-310.pyc
diff --git a/encoder/modules/__pycache__/seanet.cpython-38.pyc b/encoder/modules/__pycache__/seanet.cpython-38.pyc
diff --git a/encoder/modules/__pycache__/transformer.cpython-310.pyc b/encoder/modules/__pycache__/transformer.cpython-310.pyc
diff --git a/encoder/modules/__pycache__/transformer.cpython-38.pyc b/encoder/modules/__pycache__/transformer.cpython-38.pyc
diff --git a/encoder/quantization/__pycache__/__init__.cpython-310.pyc b/encoder/quantization/__pycache__/__init__.cpython-310.pyc
diff --git a/encoder/quantization/__pycache__/__init__.cpython-38.pyc b/encoder/quantization/__pycache__/__init__.cpython-38.pyc
diff --git a/encoder/quantization/__pycache__/core_vq.cpython-310.pyc b/encoder/quantization/__pycache__/core_vq.cpython-310.pyc
diff --git a/encoder/quantization/__pycache__/core_vq.cpython-38.pyc b/encoder/quantization/__pycache__/core_vq.cpython-38.pyc
diff --git a/encoder/quantization/__pycache__/vq.cpython-310.pyc b/encoder/quantization/__pycache__/vq.cpython-310.pyc
diff --git a/encoder/quantization/__pycache__/vq.cpython-38.pyc b/encoder/quantization/__pycache__/vq.cpython-38.pyc
diff --git a/metrics/__pycache__/UTMOS.cpython-310.pyc b/metrics/__pycache__/UTMOS.cpython-310.pyc
diff --git a/metrics/__pycache__/periodicity.cpython-310.pyc b/metrics/__pycache__/periodicity.cpython-310.pyc
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,42 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]  # the [project] table (PEP 621) is only read by setuptools 61+
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "wavtokenizer"
+version = "0.1.0"
+description = "SOTA Discrete Codec Models With Forty Tokens Per Second for Audio Language Modeling"
+authors = [
+    { name = "Shengpeng Ji" },
+]
+license = { text = "MIT" }
+readme = "README.md"
+requires-python = ">=3.8"  # torch>=2.0.0 and lightning==2.0.9 below do not support Python < 3.8
+keywords = ["speech", "audio", "tokenizer", "deep learning", "pytorch"]
+dependencies = [
+    "torch>=2.0.0",
+    "torchaudio>=2.0.1",
+    "scipy>=1.10.1",
+    "huggingface_hub>=0.23.0",
+    "matplotlib>=3.7.1",
+    "transformers>=4.28.1",
+    "lightning==2.0.9",
+    "numpy>=1.23.5",
+    "jsonargparse[signatures]>=4.15.2",
+    "einops>=0.6.1",
+    "pyyaml>=6.0",
+    "encodec>=0.1.1",
+    "tensorboardX>=2.6",
+    "soundfile>=0.12.1",
+    "fairseq",
+    "torchcrepe",
+    "librosa",
+    "pesq"
+]
+
+[project.urls]
+"Homepage" = "https://github.com/jishengpeng/WavTokenizer"
+"Documentation" = "https://github.com/jishengpeng/WavTokenizer#readme"
+"Source Code" = "https://github.com/jishengpeng/WavTokenizer"
+"Paper" = "https://arxiv.org/abs/2408.16532"
+"Demo" = "https://wavtokenizer.github.io/"
diff --git a/requirements.txt b/requirements.txt
diff --git a/wavtokenizer/__init__.py b/wavtokenizer/__init__.py
diff --git a/...s_nq1_code4096_dim512_kmeans200_attn.yaml → ...s_nq1_code4096_dim512_kmeans200_attn.yaml b/...s_nq1_code4096_dim512_kmeans200_attn.yaml → ...s_nq1_code4096_dim512_kmeans200_attn.yaml
@@ -1,24 +1,24 @@
 seed_everything: 3407
 
 data:
-  class_path: decoder.dataset.VocosDataModule
+  class_path: wavtokenizer.decoder.dataset.VocosDataModule
   init_args:
     train_params:
-      filelist_path: ./WavTokenizer/data/train/libritts_train
+      filelist_path: ./data/train/libritts_train
       sampling_rate: 24000
       num_samples: 72000
       batch_size: 40  # 20
       num_workers: 8
 
     val_params:
-      filelist_path: ./WavTokenizer/data/infer/librttts_val
+      filelist_path: ./data/infer/libritts_val
       sampling_rate: 24000
       num_samples: 72000
       batch_size: 5   # 10
       num_workers: 8
 
 model:
-  class_path: decoder.experiment.WavTokenizer
+  class_path: wavtokenizer.decoder.experiment.WavTokenizer
   init_args:
     sample_rate: 24000
     initial_learning_rate: 2e-4
@@ -33,11 +33,11 @@ model:
     evaluate_periodicty: true
 
     resume: false
-    resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code16384_dim512_kmeans800_attn.yaml
-    resume_model: ./version_3/checkpoints/xxx.ckpt
+    resume_config: ./configs/wavtokenizer_smalldata_frame40_3s_nq1_code16384_dim512_kmeans800_attn.yaml
+    resume_model: ./version_3/checkpoints/vocos_checkpoint_epoch=31_step=157696_val_loss=5.9855.ckpt
 
     feature_extractor:
-      class_path: decoder.feature_extractors.EncodecFeatures
+      class_path: wavtokenizer.decoder.feature_extractors.EncodecFeatures
       init_args:
         encodec_model: encodec_24khz
         bandwidths: [6.6, 6.6, 6.6, 6.6]
@@ -48,7 +48,7 @@ model:
         vq_kmeans: 200
 
     backbone:
-      class_path: decoder.models.VocosBackbone
+      class_path: wavtokenizer.decoder.models.VocosBackbone
       init_args:
         input_channels: 512
         dim: 768
@@ -57,7 +57,7 @@ model:
         adanorm_num_embeddings: 4  
 
     head:
-      class_path: decoder.heads.ISTFTHead
+      class_path: wavtokenizer.decoder.heads.ISTFTHead
       init_args:
         dim: 768
         n_fft: 2400 
@@ -66,28 +66,28 @@ model:
 
 trainer:
   logger:
-    class_path: pytorch_lightning.loggers.TensorBoardLogger
+    class_path: lightning.pytorch.loggers.TensorBoardLogger
     init_args:
-      save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/
+      save_dir: ./result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/
   callbacks:
-    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
-    - class_path: pytorch_lightning.callbacks.ModelSummary
+    - class_path: lightning.pytorch.callbacks.LearningRateMonitor
+    - class_path: lightning.pytorch.callbacks.ModelSummary
       init_args:
         max_depth: 2
-    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+    - class_path: lightning.pytorch.callbacks.ModelCheckpoint
       init_args:
         monitor: val_loss
         filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
         save_top_k: 10
         save_last: true
-    - class_path: decoder.helpers.GradNormCallback
+    - class_path: wavtokenizer.decoder.helpers.GradNormCallback
 
   # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
-  # This equals to 1M steps per generator and 1M per discriminator
+  # With max_steps below, this equals 10M steps for the generator and 10M for the discriminator
   max_steps: 20000000
   # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
   limit_val_batches: 200
   accelerator: gpu
-  strategy: ddp
+  strategy: ddp_find_unused_parameters_true  # use "auto" when not training with DDP
   devices: [0,1,2,3,4,5,6,7]
   log_every_n_steps: 1000