This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Update lip reading example #13647

Merged: 48 commits, merged on Feb 13, 2019

Changes from 2 commits

Commits (48)
4ce3c9d
update lipnet
Dec 14, 2018
a2d237c
update utils
soeque1 Dec 14, 2018
c6007ea
Update example/gluon/lipnet/README.md
aaronmarkham Dec 27, 2018
6cd8667
Update example/gluon/lipnet/README.md
aaronmarkham Dec 27, 2018
a0071d5
Update example/gluon/lipnet/utils/multi.py
aaronmarkham Dec 27, 2018
5f78f05
Update example/gluon/lipnet/utils/preprocess_data.py
aaronmarkham Dec 27, 2018
089455d
Update example/gluon/lipnet/utils/multi.py
aaronmarkham Dec 27, 2018
ab79109
Update example/gluon/lipnet/utils/download_data.py
aaronmarkham Dec 27, 2018
9f10967
fix error for using gpu mode
seujung Dec 28, 2018
4aa4640
Add requirements
soeque1 Dec 28, 2018
c5503d9
Remove unnecessary requirements
soeque1 Dec 28, 2018
efe6295
Update .gitignore
soeque1 Dec 28, 2018
a958ad9
Remove inappropriate license file
soeque1 Dec 28, 2018
3e8a709
Changed relative path
soeque1 Dec 31, 2018
4e7ba27
Fix description
soeque1 Dec 31, 2018
b8fbb26
Fix description
soeque1 Dec 31, 2018
ac509a5
Fix description
soeque1 Dec 31, 2018
ddeb117
Fix description
soeque1 Dec 31, 2018
271f3ac
Change doc strings and add url reference
soeque1 Dec 31, 2018
2ba0b90
Fix align_path
soeque1 Dec 31, 2018
71d779d
Remove zip files
soeque1 Dec 31, 2018
a9da0e0
Fix bugs: source_path, n_process
soeque1 Dec 31, 2018
c003210
Fix target_path
soeque1 Dec 31, 2018
e2f1b42
Fix exception handler and resume the preprocess
soeque1 Jan 1, 2019
81b0185
Pass the output when it fails to detect the mouth
soeque1 Jan 3, 2019
54afdc5
Add exception during collecting images
soeque1 Jan 3, 2019
39d3378
Add the disk space and fix default align_path
soeque1 Jan 3, 2019
fcf5251
Change optimizer
soeque1 Jan 3, 2019
22afc90
Update readme for pip
soeque1 Jan 3, 2019
8e0d34b
Update README
soeque1 Jan 4, 2019
7a1bffc
Add checkpoint folder
soeque1 Jan 5, 2019
9bf3483
Apply to train using multiprocess
soeque1 Jan 8, 2019
37a0759
update network.py
seujung Jan 10, 2019
49c0861
Update readme
soeque1 Jan 10, 2019
f2b60f5
Add test code for beamsearch
soeque1 Jan 10, 2019
b3804e6
add space
Jan 23, 2019
7d6900d
delete line and fix code
Jan 23, 2019
0ad9d29
Add shebang in BeamSearch
soeque1 Jan 23, 2019
bf550fd
Fix trainer
soeque1 Jan 23, 2019
8a42b00
Fix trainer
soeque1 Jan 24, 2019
f487255
Hybridize lip model
soeque1 Jan 24, 2019
a18a96b
Fix the shape of model
soeque1 Jan 25, 2019
66c1b94
Apply to split train and validation
soeque1 Jan 25, 2019
05009c8
Add images
soeque1 Jan 25, 2019
b2f8d51
Update readme
soeque1 Jan 25, 2019
97dbcde
Fix typo and pylint
soeque1 Jan 25, 2019
ed3e4c1
Fix loss digits of save_file and typo
soeque1 Jan 27, 2019
de1eb6b
Add info of data split and batch size
soeque1 Jan 27, 2019
106 changes: 106 additions & 0 deletions example/gluon/lipnet/.gitignore
@@ -0,0 +1,106 @@
# Byte-compiled / optimized / DLL files
Contributor: Is this file required for the example to function?

Contributor: Updated

__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

utils/*.dat
149 changes: 149 additions & 0 deletions example/gluon/lipnet/BeamSearch.py
@@ -0,0 +1,149 @@
# Licensed to the Apache Software Foundation (ASF) under one
Contributor: Can we add a shebang with the Python version? Preferably python3? Thanks.

Contributor: Added. Thanks!

# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import division
from __future__ import print_function
import numpy as np

class BeamEntry:
    "information about one single beam at specific time-step"
Contributor: Can you switch the doc strings to use the standard """? I'm not sure if this will break Sphinx later if we turn it on for the examples folder, but to be safe...

Contributor: Switched and added a URL reference.

    def __init__(self):
        self.prTotal = 0        # blank and non-blank
        self.prNonBlank = 0     # non-blank
        self.prBlank = 0        # blank
        self.prText = 1         # LM score
        self.lmApplied = False  # flag if LM was already applied to this beam
        self.labeling = ()      # beam-labeling

class BeamState:
    "information about the beams at specific time-step"
    def __init__(self):
        self.entries = {}

    def norm(self):
        "length-normalise LM score"
        for (k, _) in self.entries.items():
            labelingLen = len(self.entries[k].labeling)
            self.entries[k].prText = self.entries[k].prText ** (1.0 / (labelingLen if labelingLen else 1.0))

    def sort(self):
        "return beam-labelings, sorted by probability"
        beams = [v for (_, v) in self.entries.items()]
        sortedBeams = sorted(beams, reverse=True, key=lambda x: x.prTotal * x.prText)
        return [x.labeling for x in sortedBeams]


def applyLM(parentBeam, childBeam, classes, lm):
    "calculate LM score of child beam by taking score from parent beam and bigram probability of last two chars"
    if lm and not childBeam.lmApplied:
        c1 = classes[parentBeam.labeling[-1] if parentBeam.labeling else classes.index(' ')]  # first char
        c2 = classes[childBeam.labeling[-1]]  # second char
        lmFactor = 0.01  # influence of language model
        bigramProb = lm.getCharBigram(c1, c2) ** lmFactor  # probability of seeing first and second char next to each other
        childBeam.prText = parentBeam.prText * bigramProb  # probability of char sequence
        childBeam.lmApplied = True  # only apply LM once per beam entry


def addBeam(beamState, labeling):
    "add beam if it does not yet exist"
    if labeling not in beamState.entries:
        beamState.entries[labeling] = BeamEntry()
def ctcBeamSearch(mat, classes, lm, k, beamWidth):
Contributor: Any chance you could add a quick unit test for this function? It looks complex, and could very easily contain a bug.

Contributor: Added
"beam search as described by the paper of Hwang et al. and the paper of Graves et al."

blankIdx = len(classes)
maxT, maxC = mat.shape

# initialise beam state
last = BeamState()
labeling = ()
last.entries[labeling] = BeamEntry()
last.entries[labeling].prBlank = 1
last.entries[labeling].prTotal = 1

# go over all time-steps
for t in range(maxT):
curr = BeamState()

# get beam-labelings of best beams
bestLabelings = last.sort()[0:beamWidth]

# go over best beams
for labeling in bestLabelings:

# probability of paths ending with a non-blank
prNonBlank = 0
# in case of non-empty beam
if labeling:
# probability of paths with repeated last char at the end
try:
prNonBlank = last.entries[labeling].prNonBlank * mat[t, labeling[-1]]
except FloatingPointError:
prNonBlank = 0

# probability of paths ending with a blank
prBlank = (last.entries[labeling].prTotal) * mat[t, blankIdx]

# add beam at current time-step if needed
addBeam(curr, labeling)

# fill in data
curr.entries[labeling].labeling = labeling
curr.entries[labeling].prNonBlank += prNonBlank
curr.entries[labeling].prBlank += prBlank
curr.entries[labeling].prTotal += prBlank + prNonBlank
curr.entries[labeling].prText = last.entries[labeling].prText # beam-labeling not changed, therefore also LM score unchanged from
curr.entries[labeling].lmApplied = True # LM already applied at previous time-step for this beam-labeling

# extend current beam-labeling
for c in range(maxC - 1):
# add new char to current beam-labeling
newLabeling = labeling + (c,)

# if new labeling contains duplicate char at the end, only consider paths ending with a blank
if labeling and labeling[-1] == c:
prNonBlank = mat[t, c] * last.entries[labeling].prBlank
else:
prNonBlank = mat[t, c] * last.entries[labeling].prTotal

# add beam at current time-step if needed
addBeam(curr, newLabeling)

# fill in data
curr.entries[newLabeling].labeling = newLabeling
curr.entries[newLabeling].prNonBlank += prNonBlank
curr.entries[newLabeling].prTotal += prNonBlank

# apply LM
applyLM(curr.entries[labeling], curr.entries[newLabeling], classes, lm)

# set new beam state
last = curr

# normalise LM scores according to beam-labeling-length
last.norm()

# sort by probability
bestLabelings = last.sort()[:k] # get most probable labeling

output = []
for bestLabeling in bestLabelings:
# map labels to chars
res = ''
for l in bestLabeling:
res += classes[l]
output.append(res)
return output
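For reference, here is a minimal sketch of the kind of unit test the reviewer requested. This is illustrative only, not the test added in commit f2b60f5; the import path and expected output are assumptions:

```
import numpy as np
from BeamSearch import ctcBeamSearch

def test_ctc_beam_search_collapses_repeats():
    classes = 'ab'
    # rows are time-steps; columns are P('a'), P('b'), P(blank)
    mat = np.array([[0.8, 0.1, 0.1],
                    [0.8, 0.1, 0.1]])
    # no language model, return the 2 best labelings, beam width 3
    results = ctcBeamSearch(mat, classes, None, 2, 3)
    # 'a' twice with no blank in between collapses to a single 'a' under CTC
    assert results[0] == 'a'

test_ctc_beam_search_collapses_repeats()
```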
21 changes: 21 additions & 0 deletions example/gluon/lipnet/LICENSE
@@ -0,0 +1,21 @@
MIT License
Contributor: I'm not sure this is the appropriate license to use for a submission to this repository. Are you willing to switch this to the Apache 2.0 license?

soeque1 (Contributor), Dec 28, 2018: Removed inappropriate license file


Copyright (c) 2018 Deep Learning Student T1000

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
110 changes: 110 additions & 0 deletions example/gluon/lipnet/README.md
@@ -0,0 +1,110 @@
# LipNet: End-to-End Sentence-level Lipreading
Member:

Suggested change:
# LipNet: End-to-End Sentence-level Lipreading
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
# LipNet: End-to-End Sentence-level Lipreading

Contributor: License isn't required on readme files. @szha if you feel strongly about adding it, I'm going to modify the readme in another PR later today and I can add it then.


---

Gluon implementation of [LipNet: End-to-End Sentence-level Lipreading](https://arxiv.org/abs/1611.01599)
seujung marked this conversation as resolved.

![net_structure](asset/network_structure.png)

## Requirements
- Python 3.6.4
- MXNet 1.3.0

aaronmarkham marked this conversation as resolved.
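The PR also adds a requirements file (commit 4aa4640, "Add requirements"); assuming it sits at the example root, the dependencies would presumably be installed with:

```
pip install -r requirements.txt
```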

## Test Environment
- 4 CPU cores
- 1 GPU (Tesla K80 12GB)


## The Data
- The GRID audiovisual sentence corpus (http://spandh.dcs.shef.ac.uk/gridcorpus/)
Contributor: Might be nice to add the description from the website here:

GRID is a large multitalker audiovisual sentence corpus to support joint computational-behavioral studies in speech perception. In brief, the corpus consists of high-quality audio and video (facial) recordings of 1000 sentences spoken by each of 34 talkers (18 male, 16 female). Sentences are of the form "put red at G9 now". The corpus, together with transcriptions, is freely available for research use.

Contributor: Updated

- Video: (normal) (480 MB each)
- Align: word alignments (190 KB each)
Contributor: One sentence explaining 'word alignments' would be really useful for people new to the domain.

Contributor: Updated
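
To make 'word alignments' concrete: each GRID `.align` file lists one word per line together with its start and end time in the recording. A minimal, illustrative sketch of reading one (the helper name is hypothetical; the 'sil' silence markers are part of the GRID format):

```
def read_align(align_path):
    """Return the spoken sentence from a GRID .align file.

    Each line has the form '<start> <end> <word>';
    'sil' entries mark silence and are skipped.
    """
    words = []
    with open(align_path) as align_file:
        for line in align_file:
            _start, _end, word = line.split()
            if word != 'sil':
                words.append(word)
    return ' '.join(words)

# e.g. read_align('./data/align/bbir7s.align') -> 'bin blue in r seven soon'
```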


## Prepare the Data
### Download the data
- arguments
Contributor: Can you add total download size here in GB.

Contributor: Updated

- src_path : Path for videos (default='./data/mp4s/')
- align_path : Path for aligns (default='./data/align/')
- n_process : Number of processes (default=1)

```
cd ./utils && python download_data.py
```
aaronmarkham marked this conversation as resolved.
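As noted later in the review, `--n_process` was also added to `download_data.py`; assuming the flag matches the argument listed above, the download can be parallelized with:

```
cd ./utils && python download_data.py --n_process $(nproc)
```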

### Preprocess the Data: Extract the mouth images from each video and save them.
- arguments
Contributor: Can you add one or two sentences explaining what kind of preprocessing is being done (and very briefly how)? And if you have a sample, maybe show before and after pictures?

@aaronmarkham What was the outcome about hosting the pre-processed data? Could be really useful and save the user lots of time. We should still keep the download and pre-processing scripts though, even if we do move to hosted pre-processed data.

Contributor: @thomelane I had some issues earlier, but they seem to be resolved now. The files are being sync'd to s3://mxnet-public/lipnet/ - they're not done uploading yet.

It has:
lipnet/data/
lipnet/data/align/ # with a bunch of .align files
lipnet/data/datasets/ # with a bunch of subfolders containing .png files (of mouths)

[screenshot: mouth-preprocessed-dataset-example]

So... the readme could have the following helper for getting the preprocessed data. From the lipnet project root:

aws s3 sync s3://mxnet-public/lipnet/ .

I've tested the above command and it is busy syncing right now on another instance!

Contributor: Updated more information in the Readme

- src_path : Path for videos (default='./data/mp4s/')
- tgt_path : Path for preprocessed images (default='./data/datasets/')
- n_process : Number of processes (default=1)

Contributor:

Suggested change:
You can run the preprocessing with just one processor, but this will take a long time (>48 hours). To use all of the available processors, use the following command:

Contributor: Updated

```
cd ./utils && python preprocess_data.py
```

Contributor:

Suggested change:
cd ./utils && python preprocess_data.py
cd ./utils && python preprocess_data.py --n_process $(nproc)

Contributor: Updated. In addition to preprocess_data, $(nproc) was added to download_data.

## Data Structure

```
The training data folder should look like:
<train_data_root>
|--datasets
|--s1
|--bbir7s
|--mouth_000.png
|--mouth_001.png
...
|--bgaa8p
|--mouth_000.png
|--mouth_001.png
...
|--s2
...
|--align
|--bw1d8a.align
|--bggzzs.align
...

```


## Training

Contributor: Would be great to add training time estimates (for the specified hardware that you used).

Contributor: Updated

- arguments
- batch_size : Define batch size (default=64)
seujung marked this conversation as resolved.
- epochs : Define total epochs (default=100)
- image_path : Path for lip image files (default='./data/datasets/')
- align_path : Path for align files (default='./data/align/')
- dr_rate : Dropout rate (default=0.5)
- use_gpu : Use GPU (default=True)
- num_workers : Number of workers when generating data (default=2)

```
python main.py
```
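For example, assuming the arguments listed above are exposed as command-line flags of the same names, a run overriding a few defaults might look like:

```
python main.py --batch_size 32 --epochs 100 --num_workers 2
```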

## Results
Contributor: Add a comment about how to generate these, either via notebook or main.py.

Contributor: Removed the ipynb and added the infer.py file

```

[Target]
['lay green with a zero again',
'bin blue with r nine please',
'set blue with e five again',
'bin green by t seven soon',
'lay red at d five now',
'bin green in x eight now',
'bin blue with e one now',
'lay red at j nine now']
```

```
[Pred]
['lay green with s zero again',
'bin blue with r nine please',
'set blue with e five again',
'bin green by t seven soon',
'lay red at c five now',
'bin green in x eight now',
'bin blue with m one now',
'lay red at j nine now']
```

