[ADD] torchtext, otherwise, previous commit won't work ... 😣

Niger-Volta-LTI · May 28, 2019 · 4cd125f · 4cd125f
1 parent 23a6dda
commit 4cd125f
Show file tree

Hide file tree

Showing 21 changed files with 3,562 additions and 0 deletions.
diff --git a/src/torchtext/LICENSE b/src/torchtext/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) James Bradbury and Soumith Chintala 2016, 
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/torchtext/__init__.py b/src/torchtext/__init__.py
@@ -0,0 +1,11 @@
+from . import data
+from . import datasets
+from . import utils
+from . import vocab
+
+# Version string of this torchtext package.
+__version__ = '0.4.0'
+
+# Names exported by `from torchtext import *`: the four submodules
+# imported at the top of this module.
+__all__ = ['data',
+           'datasets',
+           'utils',
+           'vocab']
diff --git a/src/torchtext/data/__init__.py b/src/torchtext/data/__init__.py
@@ -0,0 +1,18 @@
+from .batch import Batch
+from .dataset import Dataset, TabularDataset
+from .example import Example
+from .field import RawField, Field, ReversibleField, SubwordField, NestedField, LabelField
+from .iterator import (batch, BucketIterator, Iterator, BPTTIterator,
+                       pool)
+from .pipeline import Pipeline
+from .utils import get_tokenizer, interleave_keys
+
+# Names exported by `from torchtext.data import *`; each entry is one of the
+# names imported from the sibling modules at the top of this file.
+__all__ = ["Batch",
+           "Dataset", "TabularDataset",
+           "Example",
+           "RawField", "Field", "ReversibleField", "SubwordField", "NestedField",
+           "LabelField",
+           "batch", "BucketIterator", "Iterator", "BPTTIterator",
+           "pool",
+           "Pipeline",
+           "get_tokenizer", "interleave_keys"]
diff --git a/src/torchtext/data/batch.py b/src/torchtext/data/batch.py
@@ -0,0 +1,101 @@
+import torch
+
+
+class Batch(object):
+    """Defines a batch of examples along with its Fields.
+
+    Attributes:
+        batch_size: Number of examples in the batch.
+        dataset: A reference to the dataset object the examples come from
+            (which itself contains the dataset's Field objects).
+        fields: The keys of ``dataset.fields`` (a live view, not a copy).
+        train: Deprecated: this attribute is left for backwards compatibility,
+            however it is UNUSED as of the merger with pytorch 0.4.
+        input_fields: The names of the fields that are used as input for the model
+        target_fields: The names of the fields that are used as targets during
+                       model training
+
+    Also stores the Variable for each column in the batch as an attribute.
+
+    A Batch constructed without ``data`` has none of these attributes until
+    they are assigned, which is what ``fromvars`` does.
+    """
+
+    def __init__(self, data=None, dataset=None, device=None):
+        """Create a Batch from a list of examples.
+
+        Args:
+            data: Sequence of examples. Each example must expose one
+                attribute per non-None entry of ``dataset.fields``. When
+                None, nothing is initialised and the instance stays empty.
+            dataset: Object whose ``fields`` mapping goes from field name to
+                a field object, or to None for names that should be skipped.
+                Must be provided whenever ``data`` is.
+            device: Forwarded unchanged to every ``field.process`` call.
+        """
+        if data is not None:
+            self.batch_size = len(data)
+            self.dataset = dataset
+            # Live view of the field names; it is not a snapshot.
+            self.fields = dataset.fields.keys()
+            self.input_fields = [k for k, v in dataset.fields.items() if
+                                 v is not None and not v.is_target]
+            self.target_fields = [k for k, v in dataset.fields.items() if
+                                  v is not None and v.is_target]
+
+            for (name, field) in dataset.fields.items():
+                if field is not None:
+                    # Gather this column from every example and let the
+                    # field turn it into the batched value.
+                    batch = [getattr(x, name) for x in data]
+                    setattr(self, name, field.process(batch, device=device))
+
+    @classmethod
+    def fromvars(cls, dataset, batch_size, train=None, **kwargs):
+        """Create a Batch directly from a number of Variables.
+
+        Args:
+            dataset: Object providing the ``fields`` mapping (name -> field
+                object or None).
+            batch_size: Value stored as ``batch_size``.
+            train: Ignored; accepted only for backwards compatibility.
+            **kwargs: Attribute name -> value pairs set on the new batch.
+
+        Returns:
+            The new Batch instance.
+        """
+        batch = cls()
+        batch.batch_size = batch_size
+        batch.dataset = dataset
+        batch.fields = dataset.fields.keys()
+        # Populate the input/target name lists so that iterating the batch
+        # works just like for one built through __init__. Only names that
+        # were actually supplied are listed, so __iter__ never looks up an
+        # attribute that was not set.
+        batch.input_fields = [k for k, v in dataset.fields.items() if
+                              v is not None and not v.is_target
+                              and k in kwargs]
+        batch.target_fields = [k for k, v in dataset.fields.items() if
+                               v is not None and v.is_target
+                               and k in kwargs]
+        for k, v in kwargs.items():
+            setattr(batch, k, v)
+        return batch
+
+    def __repr__(self):
+        """Return the same text as ``str(self)``."""
+        return str(self)
+
+    def __str__(self):
+        """Return a multi-line summary: one header line, one line per field."""
+        if not self.__dict__:
+            return 'Empty {} instance'.format(torch.typename(self))
+
+        fields_to_index = filter(lambda field: field is not None, self.fields)
+        # Names without a matching attribute on this batch are left out.
+        var_strs = '\n'.join(['\t[.' + name + ']' + ":" + _short_str(getattr(self, name))
+                              for name in fields_to_index if hasattr(self, name)])
+
+        # Mention the dataset only when it carries a string ``name``.
+        data_str = (' from {}'.format(self.dataset.name.upper())
+                    if hasattr(self.dataset, 'name')
+                    and isinstance(self.dataset.name, str) else '')
+
+        strt = '[{} of size {}{}]\n{}'.format(torch.typename(self),
+                                              self.batch_size, data_str, var_strs)
+        return '\n' + strt
+
+    def __len__(self):
+        """Return ``batch_size``."""
+        return self.batch_size
+
+    def _get_field_values(self, fields):
+        """Fetch the attributes named in ``fields``.
+
+        Returns:
+            None for an empty list, the bare attribute value for a single
+            name, otherwise a tuple of values in the order given.
+        """
+        if len(fields) == 0:
+            return None
+        elif len(fields) == 1:
+            return getattr(self, fields[0])
+        else:
+            return tuple(getattr(self, f) for f in fields)
+
+    def __iter__(self):
+        """Yield the input field values, then the target field values."""
+        yield self._get_field_values(self.input_fields)
+        yield self._get_field_values(self.target_fields)
+
+
+def _short_str(tensor):
+    """Return a one-line description of ``tensor`` for ``Batch.__str__``.
+
+    Args:
+        tensor: A tensor, an object wrapping a tensor in its ``data``
+            attribute, a tuple of such values, or anything else.
+
+    Returns:
+        ``'[<type> of size AxB (GPU n)]'`` for tensors (the GPU part only
+        when the tensor is on CUDA), the ``str`` of a tuple of per-element
+        descriptions for tuples, and ``str(tensor)`` for everything else.
+    """
+    # unwrap variable to tensor
+    if not torch.is_tensor(tensor):
+        unwrapped = getattr(tensor, 'data', None)
+        # (1) unpack variable -- only when ``data`` really is a tensor, so
+        # that objects with an unrelated ``data`` attribute fall through to
+        # the branches below instead of failing on ``.size()``.
+        if torch.is_tensor(unwrapped):
+            tensor = unwrapped
+        # (2) handle include_lengths
+        elif isinstance(tensor, tuple):
+            return str(tuple(_short_str(t) for t in tensor))
+        # (3) fallback to default str
+        else:
+            return str(tensor)
+
+    # copied from torch _tensor_str
+    size_str = 'x'.join(str(size) for size in tensor.size())
+    device_str = '' if not tensor.is_cuda else \
+        ' (GPU {})'.format(tensor.get_device())
+    strt = '[{} of size {}{}]'.format(torch.typename(tensor),
+                                      size_str, device_str)
+    return strt