Utils and Matutils changes (piskvorky#1062)
* Utils, Matutils changes

* Changed comments

* Changed matutils import
bhargavvader authored and jayantj committed Jan 4, 2017
1 parent bb725b6 commit b1902e5
Showing 4 changed files with 124 additions and 132 deletions.
143 changes: 78 additions & 65 deletions gensim/matutils.py
@@ -16,11 +16,12 @@

from gensim import utils

import numpy
import numpy as np
import scipy.sparse
from scipy.stats import entropy
import scipy.linalg
from scipy.linalg.lapack import get_lapack_funcs
from scipy.special import psi # gamma function utils

from six import iteritems, itervalues, string_types
from six.moves import xrange, zip as izip
@@ -33,13 +34,13 @@
from scipy.linalg.special_matrices import triu

try:
from numpy import triu_indices
from numpy import triu_indices
except ImportError:
# numpy < 1.4
# np < 1.4
def triu_indices(n, k=0):
m = numpy.ones((n, n), int)
m = np.ones((n, n), int)
a = triu(m, k)
return numpy.where(a != 0)
return np.where(a != 0)

blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]

@@ -53,21 +54,21 @@ def argsort(x, topn=None, reverse=False):
If reverse is True, return the greatest elements instead, in descending order.
"""
x = numpy.asarray(x) # unify code path for when `x` is not a numpy array (list, tuple...)
x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...)
if topn is None:
topn = x.size
if topn <= 0:
return []
if reverse:
x = -x
if topn >= x.size or not hasattr(numpy, 'argpartition'):
return numpy.argsort(x)[:topn]
# numpy >= 1.8 has a fast partial argsort, use that!
most_extreme = numpy.argpartition(x, topn)[:topn]
return most_extreme.take(numpy.argsort(x.take(most_extreme))) # resort topn into order
if topn >= x.size or not hasattr(np, 'argpartition'):
return np.argsort(x)[:topn]
# np >= 1.8 has a fast partial argsort, use that!
most_extreme = np.argpartition(x, topn)[:topn]
return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order
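
As a small illustration of the top-n behaviour (a sketch, assuming gensim is installed and exposes this module as gensim.matutils; values are made up):

import numpy as np
from gensim import matutils

x = np.array([3.0, 1.0, 2.0, 5.0, 4.0])
print(matutils.argsort(x, topn=2))                # [1 2] -- indices of the two smallest values
print(matutils.argsort(x, topn=2, reverse=True))  # [3 4] -- indices of the two largest values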


def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0):
def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
"""
Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
with documents as columns.
@@ -96,8 +97,8 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n
if num_terms is not None and num_docs is not None and num_nnz is not None:
# faster and much more memory-friendly version of creating the sparse csc
posnow, indptr = 0, [0]
indices = numpy.empty((num_nnz,), dtype=numpy.int32) # HACK assume feature ids fit in 32bit integer
data = numpy.empty((num_nnz,), dtype=dtype)
indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer
data = np.empty((num_nnz,), dtype=dtype)
for docno, doc in enumerate(corpus):
if printprogress and docno % printprogress == 0:
logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs))
@@ -122,52 +123,52 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n
num_terms = max(indices) + 1 if indices else 0
num_docs = len(indptr) - 1
# now num_docs, num_terms and num_nnz contain the correct values
data = numpy.asarray(data, dtype=dtype)
indices = numpy.asarray(indices)
data = np.asarray(data, dtype=dtype)
indices = np.asarray(indices)
result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
return result
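
For example, a minimal sketch of the conversion (illustrative toy documents in gensim's (feature_id, weight) bag-of-words format):

from gensim import matutils

corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]       # two documents, three features
csc = matutils.corpus2csc(corpus, num_terms=3)
print(csc.shape)      # (3, 2): terms are rows, documents are columns
print(csc.toarray())  # dense view of the same matrix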


def pad(mat, padrow, padcol):
"""
Add additional rows/columns to a numpy.matrix `mat`. The new rows/columns
Add additional rows/columns to a np.matrix `mat`. The new rows/columns
will be initialized with zeros.
"""
if padrow < 0:
padrow = 0
if padcol < 0:
padcol = 0
rows, cols = mat.shape
return numpy.bmat([[mat, numpy.matrix(numpy.zeros((rows, padcol)))],
[numpy.matrix(numpy.zeros((padrow, cols + padcol)))]])
return np.bmat([[mat, np.matrix(np.zeros((rows, padcol)))],
[np.matrix(np.zeros((padrow, cols + padcol)))]])
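
A quick sketch of what pad does to the shape (illustrative values):

import numpy as np
from gensim import matutils

m = np.matrix([[1, 2], [3, 4]])
padded = matutils.pad(m, padrow=1, padcol=2)
print(padded.shape)   # (3, 4): one extra row and two extra columns of zeros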


def zeros_aligned(shape, dtype, order='C', align=128):
"""Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary."""
nbytes = numpy.prod(shape, dtype=numpy.int64) * numpy.dtype(dtype).itemsize
buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8) # problematic on win64 ("maximum allowed dimension exceeded")
"""Like `np.zeros()`, but the array will be aligned at `align` byte boundary."""
nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize
buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded")
start_index = -buffer.ctypes.data % align
return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order)
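
For illustration, the alignment guarantee can be checked directly (a sketch; the 128-byte value mirrors the default in the signature above):

import numpy as np
from gensim import matutils

a = matutils.zeros_aligned((4, 3), dtype=np.float32, align=128)
print(a.shape)              # (4, 3)
print(a.ctypes.data % 128)  # 0 -- the data buffer starts on a 128-byte boundary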


def ismatrix(m):
return isinstance(m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m)
return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m)


def any2sparse(vec, eps=1e-9):
"""Convert a numpy/scipy vector into gensim document format (=list of 2-tuples)."""
if isinstance(vec, numpy.ndarray):
"""Convert a np/scipy vector into gensim document format (=list of 2-tuples)."""
if isinstance(vec, np.ndarray):
return dense2vec(vec, eps)
if scipy.sparse.issparse(vec):
return scipy2sparse(vec, eps)
return [(int(fid), float(fw)) for fid, fw in vec if numpy.abs(fw) > eps]
return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]


def scipy2sparse(vec, eps=1e-9):
"""Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
vec = vec.tocsr()
assert vec.shape[0] == 1
return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps]
return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if np.abs(val) > eps]
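
A minimal sketch of both converters on the same vector (assuming scipy is available; the exact numeric types in the printed tuples may differ):

import numpy as np
import scipy.sparse
from gensim import matutils

dense = np.array([0.0, 0.5, 0.0, 2.0])
print(matutils.any2sparse(dense))                             # [(1, 0.5), (3, 2.0)]
print(matutils.scipy2sparse(scipy.sparse.csr_matrix(dense)))  # [(1, 0.5), (3, 2.0)]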


class Scipy2Corpus(object):
@@ -179,15 +180,15 @@ class Scipy2Corpus(object):
"""
def __init__(self, vecs):
"""
`vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array,
or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors.
`vecs` is a sequence of dense and/or sparse vectors, such as a 2d np array,
or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d np/scipy vectors.
"""
self.vecs = vecs

def __iter__(self):
for vec in self.vecs:
if isinstance(vec, numpy.ndarray):
if isinstance(vec, np.ndarray):
yield full2sparse(vec)
else:
yield scipy2sparse(vec)
@@ -199,12 +200,12 @@ def __len__(self):
def sparse2full(doc, length):
"""
Convert a document in sparse document format (=sequence of 2-tuples) into a dense
numpy array (of size `length`).
np array (of size `length`).
This is the mirror function to `full2sparse`.
"""
result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value)
result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value)
doc = dict(doc)
# overwrite some of the zeroes with explicit values
result[list(doc)] = list(itervalues(doc))
@@ -213,15 +214,15 @@ def sparse2full(doc, length):

def full2sparse(vec, eps=1e-9):
"""
Convert a dense numpy array into the sparse document format (sequence of 2-tuples).
Convert a dense np array into the sparse document format (sequence of 2-tuples).
Values of magnitude < `eps` are treated as zero (ignored).
This is the mirror function to `sparse2full`.
"""
vec = numpy.asarray(vec, dtype=float)
nnz = numpy.nonzero(abs(vec) > eps)[0]
vec = np.asarray(vec, dtype=float)
nnz = np.nonzero(abs(vec) > eps)[0]
return list(zip(nnz, vec.take(nnz)))

dense2vec = full2sparse
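
The two functions round-trip, as a quick sketch with an illustrative document shows:

from gensim import matutils

doc = [(0, 0.5), (3, 2.0)]                   # gensim sparse format
dense = matutils.sparse2full(doc, length=5)  # float32 array [0.5, 0.0, 0.0, 2.0, 0.0]
print(matutils.full2sparse(dense))           # back to [(0, 0.5), (3, 2.0)]
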
@@ -232,19 +233,19 @@ def full2sparse_clipped(vec, topn, eps=1e-9):
Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
"""
# use numpy.argpartition/argsort and only form tuples that are actually returned.
# use np.argpartition/argsort and only form tuples that are actually returned.
# this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
if topn <= 0:
return []
vec = numpy.asarray(vec, dtype=float)
nnz = numpy.nonzero(abs(vec) > eps)[0]
vec = np.asarray(vec, dtype=float)
nnz = np.nonzero(abs(vec) > eps)[0]
biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True))
return list(zip(biggest, vec.take(biggest)))
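
For example (a sketch with made-up values), only the two largest-magnitude entries survive:

from gensim import matutils

vec = [0.1, -3.0, 0.0, 2.0, 0.5]
print(matutils.full2sparse_clipped(vec, topn=2))  # [(1, -3.0), (3, 2.0)]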


def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
"""
Convert corpus into a dense numpy array (documents will be columns). You
Convert corpus into a dense np array (documents will be columns). You
must supply the number of features `num_terms`, because dimensionality
cannot be deduced from the sparse vectors alone.
@@ -256,19 +257,19 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
"""
if num_docs is not None:
# we know the number of documents => don't bother column_stacking
docno, result = -1, numpy.empty((num_terms, num_docs), dtype=dtype)
docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype)
for docno, doc in enumerate(corpus):
result[:, docno] = sparse2full(doc, num_terms)
assert docno + 1 == num_docs
else:
result = numpy.column_stack(sparse2full(doc, num_terms) for doc in corpus)
result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)
return result.astype(dtype)
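
A short sketch of the dense conversion (same toy corpus format as in the earlier examples):

from gensim import matutils

corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]
dense = matutils.corpus2dense(corpus, num_terms=3)
print(dense.shape)   # (3, 2): one column per document
print(dense[:, 0])   # [1. 0. 3.] -- the first document as a dense column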



class Dense2Corpus(object):
"""
Treat dense numpy array as a sparse, streamed gensim corpus.
Treat dense np array as a sparse, streamed gensim corpus.
No data copy is made (changes to the underlying matrix imply changes in the
corpus).
@@ -329,18 +330,18 @@ def ret_normalized_vec(vec, length):
def ret_log_normalize_vec(vec, axis=1):
log_max = 100.0
if len(vec.shape) == 1:
max_val = numpy.max(vec)
log_shift = log_max - numpy.log(len(vec) + 1.0) - max_val
tot = numpy.sum(numpy.exp(vec + log_shift))
log_norm = numpy.log(tot) - log_shift
max_val = np.max(vec)
log_shift = log_max - np.log(len(vec) + 1.0) - max_val
tot = np.sum(np.exp(vec + log_shift))
log_norm = np.log(tot) - log_shift
vec = vec - log_norm
else:
if axis == 1: # independently normalize each sample
max_val = numpy.max(vec, 1)
log_shift = log_max - numpy.log(vec.shape[1] + 1.0) - max_val
tot = numpy.sum(numpy.exp(vec + log_shift[:, numpy.newaxis]), 1)
log_norm = numpy.log(tot) - log_shift
vec = vec - log_norm[:, numpy.newaxis]
max_val = np.max(vec, 1)
log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val
tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1)
log_norm = np.log(tot) - log_shift
vec = vec - log_norm[:, np.newaxis]
elif axis == 0: # normalize each feature
k = ret_log_normalize_vec(vec.T)
return (k[0].T, k[1])
@@ -349,8 +350,8 @@ def ret_log_normalize_vec(vec, axis=1):
return (vec, log_norm)
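
The function works in log space: it returns the shifted vector plus the log of the normalizing constant, so exponentiating the result yields a distribution. A sketch with illustrative values:

import numpy as np
from gensim import matutils

logvec = np.log(np.array([1.0, 2.0, 7.0]))
normed, log_norm = matutils.ret_log_normalize_vec(logvec)
print(np.exp(normed))   # [0.1 0.2 0.7] -- sums to 1
print(log_norm)         # log(10), the log of the original (unnormalized) sum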


blas_nrm2 = blas('nrm2', numpy.array([], dtype=float))
blas_scal = blas('scal', numpy.array([], dtype=float))
blas_nrm2 = blas('nrm2', np.array([], dtype=float))
blas_scal = blas('scal', np.array([], dtype=float))


def unitvec(vec, norm='l2'):
@@ -359,25 +360,25 @@ def unitvec(vec, norm='l2'):
is returned back unchanged.
Output will be in the same format as input (i.e., gensim vector=>gensim vector,
or numpy array=>numpy array, scipy.sparse=>scipy.sparse).
or np array=>np array, scipy.sparse=>scipy.sparse).
"""
if norm not in ('l1', 'l2'):
raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)
if scipy.sparse.issparse(vec):
vec = vec.tocsr()
if norm == 'l1':
veclen = numpy.sum(numpy.abs(vec.data))
veclen = np.sum(np.abs(vec.data))
if norm == 'l2':
veclen = numpy.sqrt(numpy.sum(vec.data ** 2))
veclen = np.sqrt(np.sum(vec.data ** 2))
if veclen > 0.0:
return vec / veclen
else:
return vec

if isinstance(vec, numpy.ndarray):
vec = numpy.asarray(vec, dtype=float)
if isinstance(vec, np.ndarray):
vec = np.asarray(vec, dtype=float)
if norm == 'l1':
veclen = numpy.sum(numpy.abs(vec))
veclen = np.sum(np.abs(vec))
if norm == 'l2':
veclen = blas_nrm2(vec)
if veclen > 0.0:
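
The diff view cuts the body off here; as a usage sketch of the two supported norms (illustrative values):

import numpy as np
from gensim import matutils

v = np.array([3.0, 4.0])
print(matutils.unitvec(v))              # [0.6 0.8] -- L2 length becomes 1
print(matutils.unitvec(v, norm='l1'))   # roughly [0.429 0.571] -- entries sum to 1
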
@@ -481,10 +482,10 @@ def hellinger(vec1, vec2):
vec1, vec2 = dict(vec1), dict(vec2)
if len(vec2) < len(vec1):
vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector
sim = numpy.sqrt(0.5*sum((numpy.sqrt(value) - numpy.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1)))
sim = np.sqrt(0.5*sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1)))
return sim
else:
sim = numpy.sqrt(0.5 * ((numpy.sqrt(vec1) - numpy.sqrt(vec2))**2).sum())
sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum())
return sim
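
A sketch with two similar distributions in bag-of-words format (ids and probabilities are made up):

from gensim import matutils

p = [(0, 0.5), (1, 0.3), (2, 0.2)]
q = [(0, 0.4), (1, 0.4), (2, 0.2)]
print(matutils.hellinger(p, q))   # ~0.08, small because the distributions are close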


@@ -514,9 +515,9 @@ def jaccard(vec1, vec2):
return 1 - float(intersection) / float(union)
else:
# if it isn't in bag of words format, we can use sets to calculate intersection and union
if isinstance(vec1, numpy.ndarray):
if isinstance(vec1, np.ndarray):
vec1 = vec1.tolist()
if isinstance(vec2, numpy.ndarray):
if isinstance(vec2, np.ndarray):
vec2 = vec2.tolist()
vec1 = set(vec1)
vec2 = set(vec2)
@@ -525,14 +526,26 @@ def jaccard(vec1, vec2):
return 1 - float(len(intersection)) / float(len(union))
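
For plain sequences (the else branch above), the measure reduces to 1 minus the set Jaccard similarity; a sketch:

from gensim import matutils

print(matutils.jaccard([1, 2, 3], [2, 3, 4]))   # 0.5 -- 2 shared items out of 4 distinct ones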


def dirichlet_expectation(alpha):
"""
For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.
"""
if (len(alpha.shape) == 1):
result = psi(alpha) - psi(np.sum(alpha))
else:
result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
return result.astype(alpha.dtype) # keep the same precision as input
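
For instance, checking the 1-d case against the formula in the docstring (a sketch; scipy's psi is the digamma function imported above):

import numpy as np
from scipy.special import psi
from gensim import matutils

alpha = np.array([0.1, 0.2, 0.7])
elog_theta = matutils.dirichlet_expectation(alpha)
print(np.allclose(elog_theta, psi(alpha) - psi(alpha.sum())))   # True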


def qr_destroy(la):
"""
Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process.
Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`,
because the memory used in `la[0]` is reclaimed earlier.
"""
a = numpy.asfortranarray(la[0])
a = np.asfortranarray(la[0])
del la[0], la # now `a` is the only reference to the input matrix
m, n = a.shape
# perform q, r = QR(a); code hacked out of scipy.linalg.qr