-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix return dtype for matutils.unitvec
according to input dtype. Fix #1722
#1992
Changes from 29 commits
82b8d17
834b042
cf463b2
5656835
406ed66
e71afcd
fe36408
50d011c
f1a40ac
ead451f
ae03291
cab90a8
141833d
218fe42
5fd1004
80628c0
768226b
438f763
f73076a
cd50529
11b0dde
2e86529
30d6284
d9cfb0d
2e8eca1
6beec75
365a722
9327f68
a42cf38
1078bdd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -692,34 +692,42 @@ def unitvec(vec, norm='l2', return_norm=False): | |
""" | ||
if norm not in ('l1', 'l2'): | ||
raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm) | ||
|
||
if scipy.sparse.issparse(vec): | ||
vec = vec.tocsr() | ||
if norm == 'l1': | ||
veclen = np.sum(np.abs(vec.data)) | ||
if norm == 'l2': | ||
veclen = np.sqrt(np.sum(vec.data ** 2)) | ||
if veclen > 0.0: | ||
if np.issubdtype(vec.dtype, np.int): | ||
vec = vec.astype(np.float) / veclen | ||
else: | ||
vec /= veclen | ||
vec = vec.astype(vec.dtype) | ||
if return_norm: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. revert existing code please (you shouldn't remove it) |
||
return vec / veclen, veclen | ||
return vec, veclen | ||
else: | ||
return vec / veclen | ||
return vec | ||
else: | ||
if return_norm: | ||
return vec, 1. | ||
else: | ||
return vec | ||
|
||
if isinstance(vec, np.ndarray): | ||
vec = np.asarray(vec, dtype=float) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that this line is needed especially for |
||
vec = np.asarray(vec, dtype=vec.dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe this have no sense (because later you'll cast it again) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree - this pretty much seems like a no-op, effectively. Does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yes, I forgot about this - I will remove it on the next commit. |
||
if norm == 'l1': | ||
veclen = np.sum(np.abs(vec)) | ||
if norm == 'l2': | ||
veclen = blas_nrm2(vec) | ||
if veclen > 0.0: | ||
if np.issubdtype(vec.dtype, np.int): | ||
vec = vec.astype(np.float) | ||
if return_norm: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This entire construct (and a similar construct above) seems to have some unnecessary redundant code. We could simplify to something like - if veclen > 0.0:
if np.issubdtype(vec.dtype, np.int):
vec = vec.astype(np.float)
if return_norm:
return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen
else:
return blas_scal(1.0 / veclen, vec).astype(vec.dtype) |
||
return blas_scal(1.0 / veclen, vec), veclen | ||
return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen | ||
else: | ||
return blas_scal(1.0 / veclen, vec) | ||
return blas_scal(1.0 / veclen, vec).astype(vec.dtype) | ||
else: | ||
if return_norm: | ||
return vec, 1 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
import logging | ||
import unittest | ||
import numpy as np | ||
from scipy import sparse | ||
from scipy.special import psi # gamma function utils | ||
|
||
import gensim.matutils as matutils | ||
|
@@ -141,6 +142,106 @@ def testDirichletExpectation(self): | |
self.assertTrue(np.allclose(known_good, test_values), msg) | ||
|
||
|
||
def manual_unitvec(vec): | ||
# manual unit vector calculation for UnitvecTestCase | ||
vec = vec.astype(np.float) | ||
if sparse.issparse(vec): | ||
vec_sum_of_squares = vec.multiply(vec) | ||
unit = 1. / np.sqrt(vec_sum_of_squares.sum()) | ||
return vec.multiply(unit) | ||
elif not sparse.issparse(vec): | ||
sum_vec_squared = np.sum(vec ** 2) | ||
vec /= np.sqrt(sum_vec_squared) | ||
return vec | ||
|
||
|
||
class UnitvecTestCase(unittest.TestCase): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for reorganizing the tests! Looks much better now IMO |
||
# test unitvec | ||
def test_sparse_npfloat32(self): | ||
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float32) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3)) | ||
self.assertEqual(input_vector.dtype, unit_vector.dtype) | ||
|
||
def test_sparse_npfloat64(self): | ||
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float64) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3)) | ||
self.assertEqual(input_vector.dtype, unit_vector.dtype) | ||
|
||
def test_sparse_npint32(self): | ||
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int32) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3)) | ||
self.assertTrue(np.issubdtype(unit_vector.dtype, float)) | ||
|
||
def test_sparse_npint64(self): | ||
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int64) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3)) | ||
self.assertTrue(np.issubdtype(unit_vector.dtype, float)) | ||
|
||
def test_dense_npfloat32(self): | ||
input_vector = np.random.uniform(size=(5,)).astype(np.float32) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector, man_unit_vector)) | ||
self.assertEqual(input_vector.dtype, unit_vector.dtype) | ||
|
||
def test_dense_npfloat64(self): | ||
input_vector = np.random.uniform(size=(5,)).astype(np.float64) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector, man_unit_vector)) | ||
self.assertEqual(input_vector.dtype, unit_vector.dtype) | ||
|
||
def test_dense_npint32(self): | ||
input_vector = np.random.randint(10, size=5).astype(np.int32) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector, man_unit_vector)) | ||
self.assertTrue(np.issubdtype(unit_vector.dtype, float)) | ||
|
||
def test_dense_npint64(self): | ||
input_vector = np.random.randint(10, size=5).astype(np.int32) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector, man_unit_vector)) | ||
self.assertTrue(np.issubdtype(unit_vector.dtype, float)) | ||
|
||
def test_sparse_python_float(self): | ||
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(float) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3)) | ||
self.assertEqual(input_vector.dtype, unit_vector.dtype) | ||
|
||
def test_sparse_python_int(self): | ||
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(int) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3)) | ||
self.assertTrue(np.issubdtype(unit_vector.dtype, float)) | ||
|
||
def test_dense_python_float(self): | ||
input_vector = np.random.uniform(size=(5,)).astype(float) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector, man_unit_vector)) | ||
self.assertEqual(input_vector.dtype, unit_vector.dtype) | ||
|
||
def test_dense_python_int(self): | ||
input_vector = np.random.randint(10, size=5).astype(int) | ||
unit_vector = matutils.unitvec(input_vector) | ||
man_unit_vector = manual_unitvec(input_vector) | ||
self.assertTrue(np.allclose(unit_vector, man_unit_vector)) | ||
self.assertTrue(np.issubdtype(unit_vector.dtype, float)) | ||
|
||
|
||
if __name__ == '__main__': | ||
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
unittest.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Slightly confusing. For consistency, I'd prefer -
Does that make sense?
In fact, if dividing by
veclen
cannot change the dtype (is this the case?), even something like -There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, sorry I'm really nitpicking here :)
But small things like this cause a slow decline in overall code quality over time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately dividing by
veclen
actually changes the dtype, causing the original problem to manifest itself (i.e. float32 inputs outputting float64). Even trying something like this:causes the same problem. This is why I divide by
veclen
and then cast the dtype. Do you have any suggestions to get around this?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, I really like your second suggestion, and it passes all tests:
That's very neat indeed, I'll commit it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, don't worry about the nitpicking! When I am experienced enough to pick up on such things I'm sure I will be nitpicking like this haha.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the fix and the explanation! Looks good.