From b1902e56314218c0e3593528bba348664f06ec8e Mon Sep 17 00:00:00 2001
From: Bhargav Srinivasa
Date: Wed, 28 Dec 2016 21:58:44 +0100
Subject: [PATCH] Utils and Matutils changes (#1062)

* Utils, Matutils changes

* Changed comments

* changed matutils import
---
 gensim/matutils.py        | 143 +++++++++++++++++++++-----------------
 gensim/models/hdpmodel.py |  37 ++++------
 gensim/models/ldamodel.py |  13 +---
 gensim/utils.py           |  63 +++++++++--------
 4 files changed, 124 insertions(+), 132 deletions(-)

diff --git a/gensim/matutils.py b/gensim/matutils.py
index 4c92be4562..a9d73c3fd2 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -16,11 +16,12 @@
 
 from gensim import utils
 
-import numpy
+import numpy as np
 import scipy.sparse
 from scipy.stats import entropy
 import scipy.linalg
 from scipy.linalg.lapack import get_lapack_funcs
+from scipy.special import psi # gamma function utils
 
 from six import iteritems, itervalues, string_types
 from six.moves import xrange, zip as izip
@@ -33,13 +34,13 @@
 from scipy.linalg.special_matrices import triu
 try:
     from numpy import triu_indices
 except ImportError:
     # numpy < 1.4
     def triu_indices(n, k=0):
-        m = numpy.ones((n, n), int)
+        m = np.ones((n, n), int)
         a = triu(m, k)
-        return numpy.where(a != 0)
+        return np.where(a != 0)
 
 
 blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
 
@@ -53,21 +54,21 @@ def argsort(x, topn=None, reverse=False):
     If reverse is True, return the greatest elements instead, in descending order.
 
     """
-    x = numpy.asarray(x) # unify code path for when `x` is not a numpy array (list, tuple...)
+    x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...)
     if topn is None:
         topn = x.size
     if topn <= 0:
         return []
     if reverse:
         x = -x
-    if topn >= x.size or not hasattr(numpy, 'argpartition'):
-        return numpy.argsort(x)[:topn]
-    # numpy >= 1.8 has a fast partial argsort, use that!
-    most_extreme = numpy.argpartition(x, topn)[:topn]
-    return most_extreme.take(numpy.argsort(x.take(most_extreme))) # resort topn into order
+    if topn >= x.size or not hasattr(np, 'argpartition'):
+        return np.argsort(x)[:topn]
+    # numpy >= 1.8 has a fast partial argsort, use that!
+    most_extreme = np.argpartition(x, topn)[:topn]
+    return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order
 
 
-def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0):
+def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
     """
     Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
     with documents as columns.
@@ -96,8 +97,8 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n if num_terms is not None and num_docs is not None and num_nnz is not None: # faster and much more memory-friendly version of creating the sparse csc posnow, indptr = 0, [0] - indices = numpy.empty((num_nnz,), dtype=numpy.int32) # HACK assume feature ids fit in 32bit integer - data = numpy.empty((num_nnz,), dtype=dtype) + indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer + data = np.empty((num_nnz,), dtype=dtype) for docno, doc in enumerate(corpus): if printprogress and docno % printprogress == 0: logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs)) @@ -122,15 +123,15 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n num_terms = max(indices) + 1 if indices else 0 num_docs = len(indptr) - 1 # now num_docs, num_terms and num_nnz contain the correct values - data = numpy.asarray(data, dtype=dtype) - indices = numpy.asarray(indices) + data = np.asarray(data, dtype=dtype) + indices = np.asarray(indices) result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) return result def pad(mat, padrow, padcol): """ - Add additional rows/columns to a numpy.matrix `mat`. The new rows/columns + Add additional rows/columns to a np.matrix `mat`. The new rows/columns will be initialized with zeros. """ if padrow < 0: @@ -138,36 +139,36 @@ def pad(mat, padrow, padcol): if padcol < 0: padcol = 0 rows, cols = mat.shape - return numpy.bmat([[mat, numpy.matrix(numpy.zeros((rows, padcol)))], - [numpy.matrix(numpy.zeros((padrow, cols + padcol)))]]) + return np.bmat([[mat, np.matrix(np.zeros((rows, padcol)))], + [np.matrix(np.zeros((padrow, cols + padcol)))]]) def zeros_aligned(shape, dtype, order='C', align=128): - """Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary.""" - nbytes = numpy.prod(shape, dtype=numpy.int64) * numpy.dtype(dtype).itemsize - buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8) # problematic on win64 ("maximum allowed dimension exceeded") + """Like `np.zeros()`, but the array will be aligned at `align` byte boundary.""" + nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize + buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded") start_index = -buffer.ctypes.data % align return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order) def ismatrix(m): - return isinstance(m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) + return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) def any2sparse(vec, eps=1e-9): - """Convert a numpy/scipy vector into gensim document format (=list of 2-tuples).""" - if isinstance(vec, numpy.ndarray): + """Convert a np/scipy vector into gensim document format (=list of 2-tuples).""" + if isinstance(vec, np.ndarray): return dense2vec(vec, eps) if scipy.sparse.issparse(vec): return scipy2sparse(vec, eps) - return [(int(fid), float(fw)) for fid, fw in vec if numpy.abs(fw) > eps] + return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps] def scipy2sparse(vec, eps=1e-9): """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).""" vec = vec.tocsr() assert vec.shape[0] == 1 - return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps] + return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if 
np.abs(val) > eps] class Scipy2Corpus(object): @@ -179,15 +180,15 @@ class Scipy2Corpus(object): """ def __init__(self, vecs): """ - `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array, - or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors. + `vecs` is a sequence of dense and/or sparse vectors, such as a 2d np array, + or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d np/scipy vectors. """ self.vecs = vecs def __iter__(self): for vec in self.vecs: - if isinstance(vec, numpy.ndarray): + if isinstance(vec, np.ndarray): yield full2sparse(vec) else: yield scipy2sparse(vec) @@ -199,12 +200,12 @@ def __len__(self): def sparse2full(doc, length): """ Convert a document in sparse document format (=sequence of 2-tuples) into a dense - numpy array (of size `length`). + np array (of size `length`). This is the mirror function to `full2sparse`. """ - result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value) + result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value) doc = dict(doc) # overwrite some of the zeroes with explicit values result[list(doc)] = list(itervalues(doc)) @@ -213,15 +214,15 @@ def sparse2full(doc, length): def full2sparse(vec, eps=1e-9): """ - Convert a dense numpy array into the sparse document format (sequence of 2-tuples). + Convert a dense np array into the sparse document format (sequence of 2-tuples). Values of magnitude < `eps` are treated as zero (ignored). This is the mirror function to `sparse2full`. """ - vec = numpy.asarray(vec, dtype=float) - nnz = numpy.nonzero(abs(vec) > eps)[0] + vec = np.asarray(vec, dtype=float) + nnz = np.nonzero(abs(vec) > eps)[0] return list(zip(nnz, vec.take(nnz))) dense2vec = full2sparse @@ -232,19 +233,19 @@ def full2sparse_clipped(vec, topn, eps=1e-9): Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs). """ - # use numpy.argpartition/argsort and only form tuples that are actually returned. + # use np.argpartition/argsort and only form tuples that are actually returned. # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on. if topn <= 0: return [] - vec = numpy.asarray(vec, dtype=float) - nnz = numpy.nonzero(abs(vec) > eps)[0] + vec = np.asarray(vec, dtype=float) + nnz = np.nonzero(abs(vec) > eps)[0] biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True)) return list(zip(biggest, vec.take(biggest))) -def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32): +def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32): """ - Convert corpus into a dense numpy array (documents will be columns). You + Convert corpus into a dense np array (documents will be columns). You must supply the number of features `num_terms`, because dimensionality cannot be deduced from the sparse vectors alone. 
@@ -256,19 +257,19 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32): """ if num_docs is not None: # we know the number of documents => don't bother column_stacking - docno, result = -1, numpy.empty((num_terms, num_docs), dtype=dtype) + docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype) for docno, doc in enumerate(corpus): result[:, docno] = sparse2full(doc, num_terms) assert docno + 1 == num_docs else: - result = numpy.column_stack(sparse2full(doc, num_terms) for doc in corpus) + result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus) return result.astype(dtype) class Dense2Corpus(object): """ - Treat dense numpy array as a sparse, streamed gensim corpus. + Treat dense np array as a sparse, streamed gensim corpus. No data copy is made (changes to the underlying matrix imply changes in the corpus). @@ -329,18 +330,18 @@ def ret_normalized_vec(vec, length): def ret_log_normalize_vec(vec, axis=1): log_max = 100.0 if len(vec.shape) == 1: - max_val = numpy.max(vec) - log_shift = log_max - numpy.log(len(vec) + 1.0) - max_val - tot = numpy.sum(numpy.exp(vec + log_shift)) - log_norm = numpy.log(tot) - log_shift + max_val = np.max(vec) + log_shift = log_max - np.log(len(vec) + 1.0) - max_val + tot = np.sum(np.exp(vec + log_shift)) + log_norm = np.log(tot) - log_shift vec = vec - log_norm else: if axis == 1: # independently normalize each sample - max_val = numpy.max(vec, 1) - log_shift = log_max - numpy.log(vec.shape[1] + 1.0) - max_val - tot = numpy.sum(numpy.exp(vec + log_shift[:, numpy.newaxis]), 1) - log_norm = numpy.log(tot) - log_shift - vec = vec - log_norm[:, numpy.newaxis] + max_val = np.max(vec, 1) + log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val + tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1) + log_norm = np.log(tot) - log_shift + vec = vec - log_norm[:, np.newaxis] elif axis == 0: # normalize each feature k = ret_log_normalize_vec(vec.T) return (k[0].T, k[1]) @@ -349,8 +350,8 @@ def ret_log_normalize_vec(vec, axis=1): return (vec, log_norm) -blas_nrm2 = blas('nrm2', numpy.array([], dtype=float)) -blas_scal = blas('scal', numpy.array([], dtype=float)) +blas_nrm2 = blas('nrm2', np.array([], dtype=float)) +blas_scal = blas('scal', np.array([], dtype=float)) def unitvec(vec, norm='l2'): @@ -359,25 +360,25 @@ def unitvec(vec, norm='l2'): is returned back unchanged. Output will be in the same format as input (i.e., gensim vector=>gensim vector, - or numpy array=>numpy array, scipy.sparse=>scipy.sparse). + or np array=>np array, scipy.sparse=>scipy.sparse). """ if norm not in ('l1', 'l2'): raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." 
% norm) if scipy.sparse.issparse(vec): vec = vec.tocsr() if norm == 'l1': - veclen = numpy.sum(numpy.abs(vec.data)) + veclen = np.sum(np.abs(vec.data)) if norm == 'l2': - veclen = numpy.sqrt(numpy.sum(vec.data ** 2)) + veclen = np.sqrt(np.sum(vec.data ** 2)) if veclen > 0.0: return vec / veclen else: return vec - if isinstance(vec, numpy.ndarray): - vec = numpy.asarray(vec, dtype=float) + if isinstance(vec, np.ndarray): + vec = np.asarray(vec, dtype=float) if norm == 'l1': - veclen = numpy.sum(numpy.abs(vec)) + veclen = np.sum(np.abs(vec)) if norm == 'l2': veclen = blas_nrm2(vec) if veclen > 0.0: @@ -481,10 +482,10 @@ def hellinger(vec1, vec2): vec1, vec2 = dict(vec1), dict(vec2) if len(vec2) < len(vec1): vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - sim = numpy.sqrt(0.5*sum((numpy.sqrt(value) - numpy.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) + sim = np.sqrt(0.5*sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) return sim else: - sim = numpy.sqrt(0.5 * ((numpy.sqrt(vec1) - numpy.sqrt(vec2))**2).sum()) + sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum()) return sim @@ -514,9 +515,9 @@ def jaccard(vec1, vec2): return 1 - float(intersection) / float(union) else: # if it isn't in bag of words format, we can use sets to calculate intersection and union - if isinstance(vec1, numpy.ndarray): + if isinstance(vec1, np.ndarray): vec1 = vec1.tolist() - if isinstance(vec2, numpy.ndarray): + if isinstance(vec2, np.ndarray): vec2 = vec2.tolist() vec1 = set(vec1) vec2 = set(vec2) @@ -525,6 +526,18 @@ def jaccard(vec1, vec2): return 1 - float(len(intersection)) / float(len(union)) +def dirichlet_expectation(alpha): + """ + For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`. + + """ + if (len(alpha.shape) == 1): + result = psi(alpha) - psi(np.sum(alpha)) + else: + result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] + return result.astype(alpha.dtype) # keep the same precision as input + + def qr_destroy(la): """ Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process. @@ -532,7 +545,7 @@ def qr_destroy(la): Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`, because the memory used in `la[0]` is reclaimed earlier. """ - a = numpy.asfortranarray(la[0]) + a = np.asfortranarray(la[0]) del la[0], la # now `a` is the only reference to the input matrix m, n = a.shape # perform q, r = QR(a); code hacked out of scipy.linalg.qr diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 71b37437ca..310ddb179a 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -35,9 +35,10 @@ import logging, time import numpy as np -import scipy.special as sp +from scipy.special import gammaln, psi # gamma function utils from gensim import interfaces, utils, matutils +from gensim.matutils import dirichlet_expectation from gensim.models import basemodel, ldamodel from six.moves import xrange @@ -47,23 +48,13 @@ rhot_bound = 0.0 -def dirichlet_expectation(alpha): - """ - For a vector theta ~ Dir(alpha), compute E[log(theta)] given alpha. 
- """ - if (len(alpha.shape) == 1): - return(sp.psi(alpha) - sp.psi(np.sum(alpha))) - return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis]) - - - def expect_log_sticks(sticks): """ For stick-breaking hdp, return the E[log(sticks)] """ - dig_sum = sp.psi(np.sum(sticks, 0)) - ElogW = sp.psi(sticks[0]) - dig_sum - Elog1_W = sp.psi(sticks[1]) - dig_sum + dig_sum = psi(np.sum(sticks, 0)) + ElogW = psi(sticks[0]) - dig_sum + Elog1_W = psi(sticks[1]) - dig_sum n = len(sticks[0]) + 1 Elogsticks = np.zeros(n) @@ -91,8 +82,8 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): likelihood = np.sum(counts * np.log(phinorm)) likelihood += np.sum((alpha - gamma) * Elogtheta) - likelihood += np.sum(sp.gammaln(gamma) - sp.gammaln(alpha)) - likelihood += sp.gammaln(np.sum(alpha)) - sp.gammaln(np.sum(gamma)) + likelihood += np.sum(gammaln(gamma) - gammaln(alpha)) + likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma)) return (likelihood, gamma) @@ -276,8 +267,8 @@ def update_chunk(self, chunk, update=True, opt_o=True): rw = np.array([self.m_r[t] for t in self.m_timestamp[word_list]]) self.m_lambda[:, word_list] *= np.exp(self.m_r[-1] - rw) self.m_Elogbeta[:, word_list] = \ - sp.psi(self.m_eta + self.m_lambda[:, word_list]) - \ - sp.psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) + psi(self.m_eta + self.m_lambda[:, word_list]) - \ + psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) ss = SuffStats(self.m_T, Wt, len(chunk)) @@ -363,9 +354,9 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, # v part/ v in john's notation, john's beta is alpha here log_alpha = np.log(self.m_alpha) likelihood += (self.m_K - 1) * log_alpha - dig_sum = sp.psi(np.sum(v, 0)) - likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum)) - likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v)) + dig_sum = psi(np.sum(v, 0)) + likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (psi(v) - dig_sum)) + likelihood -= np.sum(gammaln(np.sum(v, 0))) - np.sum(gammaln(v)) # Z part likelihood += np.sum((Elogsticks_2nd - log_phi) * phi) @@ -439,8 +430,8 @@ def update_expectations(self): for w in xrange(self.m_W): self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]]) - self.m_Elogbeta = sp.psi(self.m_eta + self.m_lambda) - \ - sp.psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) + self.m_Elogbeta = psi(self.m_eta + self.m_lambda) - \ + psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 4bce54f4fc..58451282ef 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -37,6 +37,7 @@ import os from gensim import interfaces, utils, matutils +from gensim.matutils import dirichlet_expectation from gensim.models import basemodel from itertools import chain @@ -57,18 +58,6 @@ logger = logging.getLogger('gensim.models.ldamodel') -def dirichlet_expectation(alpha): - """ - For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`. 
- - """ - if (len(alpha.shape) == 1): - result = psi(alpha) - psi(np.sum(alpha)) - else: - result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] - return result.astype(alpha.dtype) # keep the same precision as input - - def update_dir_prior(prior, N, logphat, rho): """ Updates a given prior using Newton's method, described in diff --git a/gensim/utils.py b/gensim/utils.py index 0bdccc46f7..133650bc42 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -36,7 +36,7 @@ from contextlib import contextmanager import subprocess -import numpy +import numpy as np import numbers import scipy.sparse @@ -86,14 +86,13 @@ def get_random_state(seed): Method originally from maciejkula/glove-python, and written by @joshloyal """ - if seed is None or seed is numpy.random: - return numpy.random.mtrand._rand - if isinstance(seed, (numbers.Integral, numpy.integer)): - return numpy.random.RandomState(seed) - if isinstance(seed, numpy.random.RandomState): + if seed is None or seed is np.random: + return np.random.mtrand._rand + if isinstance(seed, (numbers.Integral, np.integer)): + return np.random.RandomState(seed) + if isinstance(seed, np.random.RandomState): return seed - raise ValueError('%r cannot be used to seed a numpy.random.RandomState' - ' instance' % seed) + raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed) def synchronous(tlockname): @@ -294,9 +293,9 @@ def _load_specials(self, fname, mmap, compress, subname): if mmap: raise mmap_error(attrib, subname(fname, attrib)) - val = numpy.load(subname(fname, attrib))['val'] + val = np.load(subname(fname, attrib))['val'] else: - val = numpy.load(subname(fname, attrib), mmap_mode=mmap) + val = np.load(subname(fname, attrib), mmap_mode=mmap) setattr(self, attrib, val) @@ -308,14 +307,14 @@ def _load_specials(self, fname, mmap, compress, subname): if mmap: raise mmap_error(attrib, subname(fname, attrib)) - with numpy.load(subname(fname, attrib, 'sparse')) as f: + with np.load(subname(fname, attrib, 'sparse')) as f: sparse.data = f['data'] sparse.indptr = f['indptr'] sparse.indices = f['indices'] else: - sparse.data = numpy.load(subname(fname, attrib, 'data'), mmap_mode=mmap) - sparse.indptr = numpy.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) - sparse.indices = numpy.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) + sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap) + sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) + sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) setattr(self, attrib, sparse) @@ -391,7 +390,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, if separately is None: separately = [] for attrib, val in iteritems(self.__dict__): - if isinstance(val, numpy.ndarray) and val.size >= sep_limit: + if isinstance(val, np.ndarray) and val.size >= sep_limit: separately.append(attrib) elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: separately.append(attrib) @@ -414,15 +413,15 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, try: numpys, scipys, ignoreds = [], [], [] for attrib, val in iteritems(asides): - if isinstance(val, numpy.ndarray) and attrib not in ignore: + if isinstance(val, np.ndarray) and attrib not in ignore: numpys.append(attrib) - logger.info("storing numpy array '%s' to %s" % ( + logger.info("storing np array '%s' to %s" % ( attrib, subname(fname, attrib))) if compress: - numpy.savez_compressed(subname(fname, attrib), 
val=numpy.ascontiguousarray(val))
+                    np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
                 else:
-                    numpy.save(subname(fname, attrib), numpy.ascontiguousarray(val))
+                    np.save(subname(fname, attrib), np.ascontiguousarray(val))
 
             elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
                 scipys.append(attrib)
@@ -430,14 +429,14 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
                     attrib, subname(fname, attrib)))
 
                 if compress:
-                    numpy.savez_compressed(subname(fname, attrib, 'sparse'),
+                    np.savez_compressed(subname(fname, attrib, 'sparse'),
                                            data=val.data,
                                            indptr=val.indptr,
                                            indices=val.indices)
                 else:
-                    numpy.save(subname(fname, attrib, 'data'), val.data)
-                    numpy.save(subname(fname, attrib, 'indptr'), val.indptr)
-                    numpy.save(subname(fname, attrib, 'indices'), val.indices)
+                    np.save(subname(fname, attrib, 'data'), val.data)
+                    np.save(subname(fname, attrib, 'indptr'), val.indptr)
+                    np.save(subname(fname, attrib, 'indices'), val.indices)
 
                 data, indptr, indices = val.data, val.indptr, val.indices
                 val.data, val.indptr, val.indices = None, None, None
@@ -714,11 +713,11 @@ def __init__(self, corpus, slice_):
         Negative slicing can only be used if the corpus is indexable.
         Otherwise, the corpus will be iterated over.
 
-        Slice can also be a numpy.ndarray to support fancy indexing.
+        Slice can also be a np.ndarray to support fancy indexing.
 
         NOTE: calculating the size of a SlicedCorpus is expensive
         when using a slice as the corpus has to be iterated over once.
-        Using a list or numpy.ndarray does not have this drawback, but
+        Using a list or np.ndarray does not have this drawback, but
         consumes more memory.
         """
         self.corpus = corpus
@@ -736,7 +735,7 @@ def __iter__(self):
     def __len__(self):
        # check cached length, calculate if needed
        if self.length is None:
-            if isinstance(self.slice_, (list, numpy.ndarray)):
+            if isinstance(self.slice_, (list, np.ndarray)):
                self.length = len(self.slice_)
            else:
                self.length = sum(1 for x in self)
@@ -801,13 +800,13 @@ def chunkize_serial(iterable, chunksize, as_numpy=False):
     [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
 
     """
-    import numpy
+    import numpy as np
     it = iter(iterable)
     while True:
         if as_numpy:
             # convert each document to a 2d numpy array (~6x faster when transmitting
             # chunk data over the wire, in Pyro)
-            wrapped_chunk = [[numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]]
+            wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]]
         else:
             wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
         if not wrapped_chunk[0]:
@@ -829,7 +828,7 @@ def __init__(self, q, corpus, chunksize, maxsize, as_numpy):
 
     def run(self):
         if self.as_numpy:
-            import numpy # don't clutter the global namespace with a dependency on numpy
+            import numpy as np # don't clutter the global namespace with a dependency on numpy
         it = iter(self.corpus)
         while True:
             chunk = itertools.islice(it, self.chunksize)
@@ -837,7 +836,7 @@ def run(self):
                 # HACK XXX convert documents to numpy arrays, to save memory.
                 # This also gives a scipy warning at runtime:
                 # "UserWarning: indices array has non-integer dtype (float64)"
-                wrapped_chunk = [[numpy.asarray(doc) for doc in chunk]]
+                wrapped_chunk = [[np.asarray(doc) for doc in chunk]]
             else:
                 wrapped_chunk = [list(chunk)]
 
@@ -1088,8 +1087,8 @@ def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
     a Poisson distribution with parameter lambda equal to `lam`.
""" - nnz = numpy.random.uniform(size=(dim,)) - data = [(i, float(numpy.random.poisson(lam=lam) + 1.0)) + nnz = np.random.uniform(size=(dim,)) + data = [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz] return data