diff --git a/.travis.yml b/.travis.yml
index b787be1d9..13129c782 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,6 +23,7 @@ install:
   # Replace dep1 dep2 ... with your dependencies
   - conda env create --file=ci/environment-3.6.yml --name=dask-ml
   - source activate dask-ml
+  - pip install pip --upgrade
   - python -m pip install -e .
 
 script:
diff --git a/ci/environment-2.7.yml b/ci/environment-2.7.yml
index 6ee02f127..4d9aec0f2 100644
--- a/ci/environment-2.7.yml
+++ b/ci/environment-2.7.yml
@@ -29,6 +29,7 @@ dependencies:
   - sphinx_rtd_theme
   - sphinx-gallery
   - tensorflow
+  - testpath<0.4
   - tornado
   - toolz
   - xgboost
diff --git a/ci/environment-3.6.yml b/ci/environment-3.6.yml
index 3d31bce29..ce1eaa717 100644
--- a/ci/environment-3.6.yml
+++ b/ci/environment-3.6.yml
@@ -37,6 +37,7 @@ dependencies:
   - sphinx_rtd_theme
   - sphinx-gallery
   - tensorflow
+  - testpath<0.4
   - tornado
   - toolz
   - xgboost
diff --git a/ci/install-circle.sh b/ci/install-circle.sh
index 1bc54dd6a..c334290a6 100755
--- a/ci/install-circle.sh
+++ b/ci/install-circle.sh
@@ -8,5 +8,6 @@ conda config --add channels conda-forge
 conda env create -f ci/environment-${PYTHON}.yml --name=${ENV_NAME} --quiet
 conda env list
 source activate ${ENV_NAME}
+pip install pip --upgrade
 pip install --no-deps --quiet -e .
 conda list -n ${ENV_NAME}
diff --git a/dask_ml/cluster/spectral.py b/dask_ml/cluster/spectral.py
index 04c3cccda..51544e3d8 100644
--- a/dask_ml/cluster/spectral.py
+++ b/dask_ml/cluster/spectral.py
@@ -142,7 +142,7 @@ def __init__(
         eigen_solver=None,
         random_state=None,
         n_init=10,
-        gamma=1.,
+        gamma=1.0,
         affinity="rbf",
         n_neighbors=10,
         eigen_tol=0.0,
diff --git a/dask_ml/decomposition/pca.py b/dask_ml/decomposition/pca.py
index de199893b..acd4d816f 100644
--- a/dask_ml/decomposition/pca.py
+++ b/dask_ml/decomposition/pca.py
@@ -211,7 +211,7 @@ def _fit(self, X):
             # Small problem, just call full PCA
             if max(X.shape) <= 500:
                 solver = "full"
-            elif n_components >= 1 and n_components < .8 * min(X.shape):
+            elif n_components >= 1 and n_components < 0.8 * min(X.shape):
                 solver = "randomized"
             # This is also the case of n_components in (0,1)
             else:
@@ -281,7 +281,7 @@ def _fit(self, X):
             else:
                 noise_variance = explained_variance[n_components:].mean()
         else:
-            noise_variance = 0.
+            noise_variance = 0.0
 
         (
             self.n_samples_,
@@ -427,8 +427,8 @@ def score_samples(self, X):
         Xr = X - self.mean_
         n_features = X.shape[1]
         precision = self.get_precision()  # [n_features, n_features]
-        log_like = -.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
-        log_like -= .5 * (n_features * da.log(2. * np.pi) - fast_logdet(precision))
+        log_like = -0.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
+        log_like -= 0.5 * (n_features * da.log(2.0 * np.pi) - fast_logdet(precision))
         return log_like
 
     def score(self, X, y=None):
diff --git a/dask_ml/decomposition/truncated_svd.py b/dask_ml/decomposition/truncated_svd.py
index 404b8bff0..b7b5c6daf 100644
--- a/dask_ml/decomposition/truncated_svd.py
+++ b/dask_ml/decomposition/truncated_svd.py
@@ -7,7 +7,7 @@ class TruncatedSVD(BaseEstimator, TransformerMixin):
     def __init__(
-        self, n_components=2, algorithm="tsqr", n_iter=5, random_state=None, tol=0.
+        self, n_components=2, algorithm="tsqr", n_iter=5, random_state=None, tol=0.0
     ):
         """Dimensionality reduction using truncated SVD (aka LSA).
diff --git a/dask_ml/linear_model/glm.py b/dask_ml/linear_model/glm.py
index 22d67a3bb..679839fb5 100644
--- a/dask_ml/linear_model/glm.py
+++ b/dask_ml/linear_model/glm.py
@@ -228,7 +228,7 @@ def predict(self, X):
         C : array, shape = [n_samples,]
             Predicted class labels for each sample
         """
-        return self.predict_proba(X) > .5  # TODO: verify, multiclass broken
+        return self.predict_proba(X) > 0.5  # TODO: verify, multiclass broken
 
     def predict_proba(self, X):
         """Probability estimates for samples in X.
diff --git a/dask_ml/metrics/regression.py b/dask_ml/metrics/regression.py
index 84ca83207..c7f7e0fc2 100644
--- a/dask_ml/metrics/regression.py
+++ b/dask_ml/metrics/regression.py
@@ -84,7 +84,7 @@ def r2_score(
     output_scores[valid_score] = 1 - (
         numerator[valid_score] / denominator[valid_score]
     )
-    output_scores[nonzero_numerator & ~nonzero_denominator] = 0.
+    output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0
 
     result = output_scores.mean(axis=0)
     if compute:
diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py
index 81ae02b32..b271e1898 100644
--- a/dask_ml/model_selection/utils_test.py
+++ b/dask_ml/model_selection/utils_test.py
@@ -29,9 +29,9 @@ def transform(self, X):
 
     def score(self, X=None, Y=None):
         if self.foo_param > 1:
-            score = 1.
+            score = 1.0
         else:
-            score = 0.
+            score = 0.0
         return score
 
     def get_params(self, deep=False):
@@ -184,7 +184,7 @@ def predict(self, T):
 
     def score(self, X=None, Y=None):
         if self.foo_param > 1:
-            score = 1.
+            score = 1.0
         else:
-            score = 0.
+            score = 0.0
         return score
diff --git a/dask_ml/naive_bayes.py b/dask_ml/naive_bayes.py
index 59577016a..01fe6a3db 100644
--- a/dask_ml/naive_bayes.py
+++ b/dask_ml/naive_bayes.py
@@ -112,7 +112,7 @@ def _joint_log_likelihood(self, X):
         jll = []
         for i in range(np.size(self.classes_)):
             jointi = da.log(self.class_prior_[i])
-            n_ij = -0.5 * da.sum(da.log(2. * np.pi * self.sigma_[i, :]))
+            n_ij = -0.5 * da.sum(da.log(2.0 * np.pi * self.sigma_[i, :]))
             n_ij -= 0.5 * da.sum(
                 ((X - self.theta_[i, :]) ** 2) / (self.sigma_[i, :]), 1
             )
diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py
index 53c282b2b..a936fedcb 100644
--- a/dask_ml/preprocessing/data.py
+++ b/dask_ml/preprocessing/data.py
@@ -156,7 +156,7 @@ def fit(self, X, y=None):
             ]
         )
 
-        quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
+        quantiles = [da.percentile(col, [q_min, 50.0, q_max]) for col in X.T]
         quantiles = da.vstack(quantiles).compute()
         self.center_ = quantiles[:, 1]
         self.scale_ = quantiles[:, 2] - quantiles[:, 0]
diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst
index 5f18e7ead..985b9fe72 100644
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -167,7 +167,7 @@ In this toy example, we use a dataset with two columns. ``'A'`` is numeric and
     pipe = make_pipeline(
         Categorizer(),
         DummyEncoder(),
-        LogisticRegression()
+        LogisticRegression(solver='lbfgs')
     )
 
     pipe.fit(X, y)
diff --git a/setup.cfg b/setup.cfg
index 055523821..c87964caf 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,5 +30,3 @@ source=dask_ml
 addopts = -rsx -v --durations=10
 minversion = 3.2
 xfail_strict = true
-filterwarnings =
-    error:::sklearn[.*]
diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py
index 1eaf37b53..1983325ba 100644
--- a/tests/model_selection/dask_searchcv/test_model_selection.py
+++ b/tests/model_selection/dask_searchcv/test_model_selection.py
@@ -84,9 +84,9 @@ def _start(self, dsk):
 
 def test_visualize():
     pytest.importorskip("graphviz")
-    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, random_state=0)
+    X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
     clf = SVC(random_state=0, gamma="auto")
-    grid = {"C": [.1, .5, .9]}
+    grid = {"C": [0.1, 0.5, 0.9]}
     gs = dcv.GridSearchCV(clf, grid).fit(X, y)
 
     assert hasattr(gs, "dask_graph_")
diff --git a/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py b/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
index 9a7ffddab..fa8ca67eb 100644
--- a/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
+++ b/tests/model_selection/dask_searchcv/test_model_selection_sklearn.py
@@ -123,7 +123,7 @@ def test_grid_search_no_score():
     # Test grid-search on classifier that has no score function.
     clf = LinearSVC(random_state=0)
     X, y = make_blobs(random_state=0, centers=2)
-    Cs = [.1, 1, 10]
+    Cs = [0.1, 1, 10]
     clf_no_score = LinearSVCNoScore(random_state=0)
 
     # XXX: It seems there's some global shared state in LinearSVC - fitting
@@ -152,9 +152,9 @@
 
 
 def test_grid_search_score_method():
-    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, random_state=0)
+    X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
     clf = LinearSVC(random_state=0)
-    grid = {"C": [.1]}
+    grid = {"C": [0.1]}
 
     search_no_scoring = dcv.GridSearchCV(clf, grid, scoring=None).fit(X, y)
     search_accuracy = dcv.GridSearchCV(clf, grid, scoring="accuracy").fit(X, y)
@@ -260,7 +260,7 @@ def test_classes__property():
     # Test that classes_ property matches best_estimator_.classes_
     X = np.arange(100).reshape(10, 10)
     y = np.array([0] * 5 + [1] * 5)
-    Cs = [.1, 1, 10]
+    Cs = [0.1, 1, 10]
 
     grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs})
     grid_search.fit(X, y)
@@ -418,7 +418,7 @@ def test_grid_search_sparse():
     y_pred2 = cv.predict(X_[180:])
     C2 = cv.best_estimator_.C
 
-    assert np.mean(y_pred == y_pred2) >= .9
+    assert np.mean(y_pred == y_pred2) >= 0.9
     assert C == C2
 
 
@@ -611,14 +611,16 @@ def test_gridsearch_no_predict():
     # test grid-search with an estimator without predict.
     # slight duplication of a test from KDE
     def custom_scoring(estimator, X):
-        return 42 if estimator.bandwidth == .1 else 0
+        return 42 if estimator.bandwidth == 0.1 else 0
 
-    X, _ = make_blobs(cluster_std=.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]])
+    X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]])
     search = dcv.GridSearchCV(
-        KernelDensity(), param_grid=dict(bandwidth=[.01, .1, 1]), scoring=custom_scoring
+        KernelDensity(),
+        param_grid=dict(bandwidth=[0.01, 0.1, 1]),
+        scoring=custom_scoring,
     )
     search.fit(X)
-    assert search.best_params_["bandwidth"] == .1
+    assert search.best_params_["bandwidth"] == 0.1
     assert search.best_score_ == 42
@@ -852,15 +854,15 @@ def test_search_iid_param():
 
     # Test the first candidate
     assert search.cv_results_["param_C"][0] == 1
-    assert_array_almost_equal(test_cv_scores, [1, 1. / 3.])
+    assert_array_almost_equal(test_cv_scores, [1, 1.0 / 3.0])
     assert_array_almost_equal(train_cv_scores, [1, 1])
 
     # for first split, 1/4 of dataset is in test, for second 3/4.
     # take weighted average and weighted std
-    expected_test_mean = 1 * 1. / 4. + 1. / 3. * 3. / 4.
+    expected_test_mean = 1 * 1.0 / 4.0 + 1.0 / 3.0 * 3.0 / 4.0
     expected_test_std = np.sqrt(
-        1. / 4 * (expected_test_mean - 1) ** 2
-        + 3. / 4 * (expected_test_mean - 1. / 3.) ** 2
+        1.0 / 4 * (expected_test_mean - 1) ** 2
+        + 3.0 / 4 * (expected_test_mean - 1.0 / 3.0) ** 2
     )
     assert_almost_equal(test_mean, expected_test_mean)
     assert_almost_equal(test_std, expected_test_std)
@@ -911,7 +913,7 @@ def test_search_iid_param():
     assert search.cv_results_["param_C"][0] == 1
 
     # scores are the same as above
-    assert_array_almost_equal(test_cv_scores, [1, 1. / 3.])
+    assert_array_almost_equal(test_cv_scores, [1, 1.0 / 3.0])
     # Unweighted mean/std is used
     assert_almost_equal(test_mean, np.mean(test_cv_scores))
     assert_almost_equal(test_std, np.std(test_cv_scores))
@@ -984,7 +986,7 @@ def test_grid_search_correct_score_results():
     n_splits = 3
     clf = LinearSVC(random_state=0)
     X, y = make_blobs(random_state=0, centers=2)
-    Cs = [.1, 1, 10]
+    Cs = [0.1, 1, 10]
     for score in ["f1", "roc_auc"]:
         # XXX: It seems there's some global shared state in LinearSVC - fitting
         # multiple `SVC` instances in parallel using threads sometimes results
diff --git a/tests/model_selection/test_incremental.py b/tests/model_selection/test_incremental.py
index 5f9020075..68563ce34 100644
--- a/tests/model_selection/test_incremental.py
+++ b/tests/model_selection/test_incremental.py
@@ -131,7 +131,7 @@ def test_partial_fit_doesnt_mutate_inputs():
 def test_explicit(c, s, a, b):
     X, y = make_classification(n_samples=1000, n_features=10, chunks=(200, 10))
     model = SGDClassifier(tol=1e-3, penalty="elasticnet")
-    params = [{"alpha": .1}, {"alpha": .2}]
+    params = [{"alpha": 0.1}, {"alpha": 0.2}]
 
     def additional_calls(scores):
         """ Progress through predefined updates, checking along the way """
diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 19385d092..de5bacfa6 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -216,7 +216,7 @@ def test_basic(self):
         X = rs.uniform(size=(100, 3), chunks=50)
         a.fit(X)
         b.fit(X)
-        assert_estimator_equal(a, b, atol=.02)
+        assert_estimator_equal(a, b, atol=0.02)
 
         # set the quantiles, so that from here out, we're exact
         a.quantiles_ = b.quantiles_
diff --git a/tests/test_pca.py b/tests/test_pca.py
index 25f0b472a..50aa0dc73 100644
--- a/tests/test_pca.py
+++ b/tests/test_pca.py
@@ -316,7 +316,7 @@ def test_pca_check_projection():
     # Test that the projection of data is correct
     rng = np.random.RandomState(0)
     n, p = 100, 3
-    X = rng.randn(n, p) * .1
+    X = rng.randn(n, p) * 0.1
     X[:10] += np.array([3, 4, 5])
     Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])
     dX = da.from_array(X, chunks=(n, p))
@@ -326,7 +326,7 @@ def test_pca_check_projection():
         Yt = dd.PCA(n_components=2, svd_solver=solver).fit(dX).transform(dXt)
         Yt /= np.sqrt((Yt ** 2).sum())
 
-        assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
+        assert_almost_equal(np.abs(Yt[0][0]), 1.0, 1)
 
 
 def test_pca_inverse():
@@ -334,7 +334,7 @@ def test_pca_inverse():
     rng = np.random.RandomState(0)
     n, p = 50, 3
     X = rng.randn(n, p)  # spherical data
-    X[:, 1] *= .00001  # make middle component relatively small
+    X[:, 1] *= 0.00001  # make middle component relatively small
     X += [5, 4, 3]  # make a large mean
 
     dX = da.from_array(X, chunks=(n // 2, p))
@@ -419,7 +419,7 @@ def test_randomized_pca_check_projection():
     # Test that the projection by randomized PCA on dense data is correct
     rng = np.random.RandomState(0)
     n, p = 100, 3
-    X = rng.randn(n, p) * .1
+    X = rng.randn(n, p) * 0.1
     X[:10] += np.array([3, 4, 5])
     Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])
     X = da.from_array(X, chunks=(n, p))
@@ -432,7 +432,7 @@ def test_randomized_pca_check_projection():
     )
     Yt /= np.sqrt((Yt ** 2).sum())
 
-    assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
+    assert_almost_equal(np.abs(Yt[0][0]), 1.0, 1)
 
 
 @pytest.mark.xfail(reason="chunks")
@@ -454,7 +454,7 @@ def test_randomized_pca_inverse():
     rng = np.random.RandomState(0)
     n, p = 50, 3
     X = rng.randn(n, p)  # spherical data
-    X[:, 1] *= .00001  # make middle component relatively small
+    X[:, 1] *= 0.00001  # make middle component relatively small
     X += [5, 4, 3]  # make a large mean
 
     dX = da.from_array(X, chunks=(n, p))
@@ -480,7 +480,7 @@ def test_pca_dim():
     # Check automated dimensionality setting
     rng = np.random.RandomState(0)
     n, p = 100, 5
-    X = rng.randn(n, p) * .1
+    X = rng.randn(n, p) * 0.1
     X[:10] += np.array([3, 4, 5, 1, 2])
     pca = dd.PCA(n_components="mle", svd_solver="full").fit(X)
     assert_equal(pca.n_components, "mle")
@@ -493,7 +493,7 @@ def test_infer_dim_1():
     n, p = 1000, 5
     rng = np.random.RandomState(0)
     X = (
-        rng.randn(n, p) * .1
+        rng.randn(n, p) * 0.1
         + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2])
         + np.array([1, 0, 7, 4, 6])
     )
@@ -505,7 +505,7 @@ def test_infer_dim_1():
     for k in range(p):
         ll.append(_assess_dimension_(spect, k, n, p))
     ll = np.array(ll)
-    assert_greater(ll[1], ll.max() - .01 * n)
+    assert_greater(ll[1], ll.max() - 0.01 * n)
 
 
 def test_infer_dim_2():
@@ -513,7 +513,7 @@ def test_infer_dim_2():
     # Or at least use explicit variable names...
     n, p = 1000, 5
     rng = np.random.RandomState(0)
-    X = rng.randn(n, p) * .1
+    X = rng.randn(n, p) * 0.1
     X[:10] += np.array([3, 4, 5, 1, 2])
     X[10:20] += np.array([6, 0, 7, 2, -1])
     dX = da.from_array(X, chunks=(n, p))
@@ -526,7 +526,7 @@
 def test_infer_dim_3():
     n, p = 100, 5
     rng = np.random.RandomState(0)
-    X = rng.randn(n, p) * .1
+    X = rng.randn(n, p) * 0.1
     X[:10] += np.array([3, 4, 5, 1, 2])
     X[10:20] += np.array([6, 0, 7, 2, -1])
     X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
@@ -554,7 +554,7 @@ def test_infer_dim_by_explained_variance():
     rng = np.random.RandomState(0)
     # more features than samples
     X = rng.rand(5, 20)
-    pca = dd.PCA(n_components=.5, svd_solver="full").fit(X)
+    pca = dd.PCA(n_components=0.5, svd_solver="full").fit(X)
     assert_equal(pca.n_components, 0.5)
     assert_equal(pca.n_components_, 2)
@@ -563,7 +563,7 @@ def test_pca_score():
     # Test that probabilistic PCA scoring yields a reasonable score
     n, p = 1000, 3
     rng = np.random.RandomState(0)
-    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
+    X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5])
     dX = da.from_array(X, chunks=(n // 2, p))
     for solver in solver_list:
         pca = dd.PCA(n_components=2, svd_solver=solver)
@@ -577,13 +577,13 @@ def test_pca_score2():
     # Test that probabilistic PCA correctly separated different datasets
     n, p = 100, 3
     rng = np.random.RandomState(0)
-    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
+    X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5])
     dX = da.from_array(X, chunks=(n // 2, p))
     for solver in solver_list:
         pca = dd.PCA(n_components=2, svd_solver=solver)
         pca.fit(dX)
         ll1 = pca.score(dX)
-        ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
+        ll2 = pca.score(rng.randn(n, p) * 0.2 + np.array([3, 4, 5]))
         assert_greater(ll1, ll2)
 
         # Test that it gives different scores if whiten=True
@@ -644,7 +644,7 @@ def test_pca_zero_noise_variance_edge_cases():
     n, p = 100, 3
     rng = np.random.RandomState(0)
-    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
+    X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5])
     dX = da.from_array(X, chunks=(n, p))
     # arpack raises ValueError for n_components == min(n_samples,
     # n_features)