SparseLSTM #175

Merged (3 commits) on Nov 2, 2018
17 changes: 17 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,23 @@
[Unreleased]
------------

Added
^^^^^^^
* `@senwu`_: Add ``SparseLSTM`` support to reduce LSTM memory usage for large applications.
(`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_)

.. note::
With the SparseLSTM discriminative model, memory usage is reduced compared to
the original LSTM model. In Fonduer v0.3.5, SparseLSTM is used as follows:

.. code:: python

disc_model = SparseLSTM()
disc_model.train(
(train_cands, train_feature), train_marginals, n_epochs=5, lr=0.001
)
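
For comparison, the original dense ``LSTM`` model is trained in the same way
(a minimal sketch, assuming the two models share the same training interface):

.. code:: python

    disc_model = LSTM()
    disc_model.train(
        (train_cands, train_feature), train_marginals, n_epochs=5, lr=0.001
    )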


Fixed
^^^^^
* `@senwu`_: Use the latest MeTaL syntax and fix flake8 issues.
18 changes: 18 additions & 0 deletions docs/user/learning.rst
@@ -54,6 +54,24 @@ The learning parameters of different models are described below::
LogisticRegression:
# bias term
bias: False
# Sparse LSTM model
SparseLSTM:
# Word embedding dimension size
emb_dim: 100
# The number of features in the LSTM hidden state
hidden_dim: 100
# Use attention or not (Options: True or False)
attention: True
# Dropout parameter
dropout: 0.1
# Use bidirectional LSTM or not (Options: True or False)
bidirectional: True
# Preferred host device (Options: CPU or GPU)
host_device: "CPU"
# Maximum sentence length of LSTM input
max_sentence_length: 100
# bias term
bias: False
# Sparse Logistic Regression model
SparseLogisticRegression:
# bias term
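
These defaults can be inspected programmatically with the same ``get_config``
helper the model uses internally; a minimal sketch (the printed values assume
the defaults shown above):

.. code:: python

    from fonduer.utils.config import get_config

    # Default hyperparameters for the SparseLSTM discriminative model
    sparse_lstm_defaults = get_config()["learning"]["SparseLSTM"]
    print(sparse_lstm_defaults["hidden_dim"])     # 100
    print(sparse_lstm_defaults["bidirectional"])  # True
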
3 changes: 2 additions & 1 deletion src/fonduer/learning/__init__.py
@@ -1,7 +1,8 @@
from fonduer.learning.disc_models.logistic_regression import LogisticRegression
from fonduer.learning.disc_models.lstm import LSTM
from fonduer.learning.disc_models.sparse_lstm import SparseLSTM
from fonduer.learning.disc_models.sparse_logistic_regression import (
SparseLogisticRegression,
)

__all__ = ["LogisticRegression", "LSTM", "SparseLogisticRegression"]
__all__ = ["LogisticRegression", "LSTM", "SparseLogisticRegression", "SparseLSTM"]
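
With this export, the new model can be imported from the package namespace like
the existing models (a minimal sketch):

.. code:: python

    from fonduer.learning import SparseLSTM

    disc_model = SparseLSTM()
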
4 changes: 3 additions & 1 deletion src/fonduer/learning/disc_models/lstm.py
@@ -26,7 +26,7 @@ def forward(self, x, f):
"""Forward function.

:param x: The sequence input (batch) of the model.
:type x: torch.Tensor of shape (sequence_len * batch_size)
:type x: list of torch.Tensor of shape (sequence_len * batch_size)
:param f: The feature input of the model.
:type f: torch.Tensor of shape (batch_size * feature_size)
:return: The output of LSTM layer.
@@ -214,8 +214,10 @@ def _calc_logits(self, X, batch_size=None):

# TODO: optimize this
sequences = []
# Loop over each relation arity
for i in range(len(C[0])):
sequence = []
# Generate sequence for the batch
for j in range(batch_st, batch_ed):
sequence.append(C[j][i])
x, x_mask = pad_batch(sequence, self.settings["max_sentence_length"])
src/fonduer/learning/disc_models/sparse_logistic_regression.py
@@ -19,8 +19,10 @@ def forward(self, x, w):
"""
Run forward pass.

:param x: The input (batch) of the model.
:param x: The input feature (batch) of the model.
:type x: torch.Tensor of shape (batch_size, num_classes)
:param w: The input feature weight (batch) of the model.
:type w: torch.Tensor of shape (batch_size, num_classes)
:return: The output of sparse Logistic Regression layer.
:rtype: torch.Tensor of shape (batch_size, num_classes)
"""
@@ -41,8 +43,7 @@ def _check_input(self, X):
def _preprocess_data(self, X, Y=None, idxs=None, train=False):
"""
Preprocess the data:
1. Convert sparse matrix to dense matrix.
2. Select subset of the input if idxs exists.
1. Select subset of the input if idxs exists.

:param X: The input data of the model.
:type X: pair with candidates and corresponding features
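
The updated docstring above describes a forward pass driven by paired feature
indices ``x`` and feature weights ``w``. The following is a minimal, self-contained
sketch of what such a sparse linear layer computes; it is an illustration of the
idea only, not Fonduer's ``SparseLinear`` implementation, and ``SparseLinearSketch``
is a hypothetical name:

.. code:: python

    import torch
    import torch.nn as nn

    class SparseLinearSketch(nn.Module):
        """Linear layer over sparse (feature index, feature value) pairs."""

        def __init__(self, num_features, num_classes, bias=False):
            super().__init__()
            # One row of class weights per feature; index 0 is reserved for padding
            self.weight = nn.Embedding(num_features, num_classes, padding_idx=0)
            self.b = nn.Parameter(torch.zeros(num_classes)) if bias else None

        def forward(self, x, w):
            # x: (batch_size, nnz) feature indices, w: (batch_size, nnz) feature values
            out = (self.weight(x) * w.unsqueeze(2)).sum(dim=1)
            return out + self.b if self.b is not None else out
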
294 changes: 294 additions & 0 deletions src/fonduer/learning/disc_models/sparse_lstm.py
@@ -0,0 +1,294 @@
import numpy as np
import torch
import torch.nn as nn

from fonduer.learning.disc_learning import NoiseAwareModel
from fonduer.learning.disc_models.layers.rnn import RNN
from fonduer.learning.disc_models.layers.sparse_linear import SparseLinear
from fonduer.learning.disc_models.utils import (
SymbolTable,
mark_sentence,
mention_to_tokens,
pad_batch,
)
from fonduer.utils.config import get_config


class SparseLSTM(NoiseAwareModel):
"""
Sparse LSTM model.

:param name: User-defined name of the model
:type name: str
"""

def forward(self, x, x_idx, f, w):
"""Forward function.

:param x: The sequence input (batch) of the model.
:type x: list of torch.Tensor of shape (sequence_len * batch_size)
:param x_idx: The feature indices for lstm output (batch) of the model.
:type x_idx: torch.Tensor of shape (lstm_out_size * batch_size)
:param f: The feature input of the model.
:type f: torch.Tensor of shape (batch_size * feature_size)
:param w: The input feature weight (batch) of the model.
:type w: torch.Tensor of shape (batch_size, num_classes)
:return: The output of LSTM layer.
:rtype: torch.Tensor of shape (batch_size, num_classes)
"""

batch_size = len(f)

outputs = (
torch.Tensor([]).cuda()
if self.settings["host_device"] in self._gpu
else torch.Tensor([])
)

# Calculate textual features from LSTMs
for i in range(len(x)):
state_word = self.lstms[0].init_hidden(batch_size)
output = self.lstms[0].forward(x[i][0], x[i][1], state_word)
outputs = torch.cat((outputs, output), 1)

# Concatenate textual features with multi-modal features
features = torch.cat((x_idx, f), 1)
weights = torch.cat((outputs, w), 1)

return self.sparse_linear(features, weights)

def _check_input(self, X):
"""Check input format.

:param X: The input data of the model.
:type X: (candidates, features) pair
:return: True if valid, otherwise False.
:rtype: bool
"""
return isinstance(X, tuple)

def _preprocess_data(self, X, Y=None, idxs=None, train=False):
"""
Preprocess the data:
1. Convert sentences with mentions into sequence data for the LSTM.
2. Select subset of the input if idxs exists.

:param X: The input data of the model.
:type X: pair with candidates and corresponding features
:param Y: The labels of input data (optional).
:type Y: list of floats if num_classes = 2
otherwise num_classes-length numpy array
:param idxs: The selected indices of the input data.
:type idxs: list or numpy.array
:param train: Whether to extend the word dictionary with new words.
:type train: bool
:return: Preprocessed data.
:rtype: list of (candidate, features) pairs
"""

C, F = X

# Create word dictionary for LSTM
if not hasattr(self, "word_dict"):
self.word_dict = SymbolTable()
arity = len(C[0])
# Add paddings into word dictionary
for i in range(arity):
# TODO: optimize this
list(map(self.word_dict.get, ["~~[[" + str(i), str(i) + "]]~~"]))

# Make sequence input for LSTM from candidates
seq_data = []
for candidate in C:
cand_idx = []
for i in range(len(candidate)):
# Add mark for each mention in the original sentence
args = [
(
candidate[i].span.get_word_start_index(),
candidate[i].span.get_word_end_index(),
i,
)
]
s = mark_sentence(mention_to_tokens(candidate[i]), args)
f = self.word_dict.get if train else self.word_dict.lookup
cand_idx.append(list(map(f, s)))
seq_data.append(cand_idx)

# Generate the preprocessed input
if idxs is None:
if Y is not None:
return (
[
(
seq_data[i],
F.indices[F.indptr[i] : F.indptr[i + 1]],
F.data[F.indptr[i] : F.indptr[i + 1]],
)
for i in range(len(C))
],
Y,
)
else:
return [
(
seq_data[i],
F.indices[F.indptr[i] : F.indptr[i + 1]],
F.data[F.indptr[i] : F.indptr[i + 1]],
)
for i in range(len(C))
]
if Y is not None:
return (
[
(
seq_data[i],
F.indices[F.indptr[i] : F.indptr[i + 1]],
F.data[F.indptr[i] : F.indptr[i + 1]],
)
for i in idxs
],
Y[idxs],
)
else:
return [
(
seq_data[i],
F.indices[F.indptr[i] : F.indptr[i + 1]],
F.data[F.indptr[i] : F.indptr[i + 1]],
)
for i in idxs
]

def _update_settings(self, X):
"""
Update the model argument.

:param X: The input data of the model.
:type X: list of (candidate, features) pairs
"""

self.logger.info("Load default parameters for Sparse LSTM")
config = get_config()["learning"]["SparseLSTM"]

for key in config.keys():
if key not in self.settings:
self.settings[key] = config[key]

self.settings["relation_arity"] = len(X[0][0])
self.settings["lstm_dim"] = (
len(X[0][0])
* self.settings["hidden_dim"]
* (2 if self.settings["bidirectional"] else 1)
)

# Add one feature for padding vector (all 0s)
self.settings["input_dim"] = (
X[1].shape[1]
+ len(X[0][0])
* self.settings["hidden_dim"]
* (2 if self.settings["bidirectional"] else 1)
+ 1
)

def _build_model(self):
"""
Build the model.
"""
# Set up LSTM modules
self.lstms = nn.ModuleList(
[
RNN(
num_classes=0,
num_tokens=self.word_dict.s,
emb_size=self.settings["emb_dim"],
lstm_hidden=self.settings["hidden_dim"],
attention=self.settings["attention"],
dropout=self.settings["dropout"],
bidirectional=self.settings["bidirectional"],
use_cuda=self.settings["host_device"] in self._gpu,
)
]
* self.settings["relation_arity"]
)

if "input_dim" not in self.settings:
raise ValueError("Model parameter input_dim cannot be None.")

cardinality = self.cardinality if self.cardinality > 2 else 1

# Set up final linear layer
self.sparse_linear = SparseLinear(
self.settings["input_dim"], cardinality, self.settings["bias"]
)

def _calc_logits(self, X, batch_size=None):
"""
Calculate the logits.

:param X: The input data of the model.
:type X: list of (candidate, features) pairs
:param batch_size: The batch size.
:type batch_size: int
:return: The output logits of model.
:rtype: torch.Tensor of shape (batch_size, num_classes) if num_classes > 2
otherwise shape (batch_size, 1)
"""

# Generate LSTM input
C = np.array(list(zip(*X))[0])

# Check LSTM input dimension size matches the number of lstms in the model
assert len(C[0]) == len(self.lstms)

# Generate sparse multi-modal feature input
F = (
np.array(list(zip(*X))[1]) + self.settings["lstm_dim"] + 1
)  # Shift the raw indices since 0 is the padding slot and 1..lstm_dim hold the LSTM features
V = np.array(list(zip(*X))[2])

outputs = (
torch.Tensor([]).cuda()
if self.settings["host_device"] in self._gpu
else torch.Tensor([])
)

n = len(F)
if batch_size is None:
batch_size = n
for batch_st in range(0, n, batch_size):
batch_ed = batch_st + batch_size if batch_st + batch_size <= n else n

# TODO: optimize this
sequences = []
# Loop over each relation arity
for i in range(len(C[0])):
sequence = []
# Generate sequence for the batch
for j in range(batch_st, batch_ed):
sequence.append(C[j][i])
x, x_mask = pad_batch(sequence, self.settings["max_sentence_length"])
if self.settings["host_device"] in self._gpu:
x = x.cuda()
x_mask = x_mask.cuda()
sequences.append((x, x_mask))

lstm_weight_indices = torch.as_tensor(
np.arange(1, self.settings["lstm_dim"] + 1)
).repeat(batch_ed - batch_st, 1)

features, _ = pad_batch(F[batch_st:batch_ed], 0)
values, _ = pad_batch(V[batch_st:batch_ed], 0, type="float")

if self.settings["host_device"] in self._gpu:
lstm_weight_indices = lstm_weight_indices.cuda()
features = features.cuda()
values = values.cuda()

output = self.forward(sequences, lstm_weight_indices, features, values)
if self.cardinality == 2:
outputs = torch.cat((outputs, output.view(-1)), 0)
else:
outputs = torch.cat((outputs, output), 0)

return outputs
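
To make the feature-index bookkeeping in ``_preprocess_data`` and ``_calc_logits``
concrete: index 0 is the shared padding slot, indices ``1..lstm_dim`` carry the
weights produced by the LSTMs, and the raw sparse feature indices are shifted past
both. A small sketch with purely illustrative numbers:

.. code:: python

    import numpy as np

    lstm_dim = 4  # e.g. arity * hidden_dim * num_directions (illustrative value)

    # Raw column indices taken from the sparse multi-modal feature matrix F
    raw_feature_indices = np.array([0, 2, 7])

    # Shift past the padding slot (0) and the lstm_dim LSTM slots (1..lstm_dim)
    shifted = raw_feature_indices + lstm_dim + 1      # -> array([ 5,  7, 12])

    # Every candidate in a batch addresses the same fixed LSTM slots
    lstm_weight_indices = np.arange(1, lstm_dim + 1)  # -> array([1, 2, 3, 4])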