From c85ab083919b59ce84c220d5baf7d34ff4a0bcf2 Mon Sep 17 00:00:00 2001
From: Ming Li <14131823+minggli@users.noreply.github.com>
Date: Tue, 29 May 2018 11:41:27 +0100
Subject: [PATCH] BUG: set keyword argument so zipfile actually compresses (#21144)

---
 doc/source/whatsnew/v0.23.1.txt |  1 +
 pandas/io/common.py             |  8 ++++----
 pandas/tests/test_common.py     | 21 ++++++++++++++++++++-
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index fc6f3f3bfa614..35484e34ee9eb 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -86,6 +86,7 @@ Indexing
 I/O
 ^^^
 
+- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
 -
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 0827216975f15..a492b7c0b8e8e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -5,7 +5,7 @@
 import codecs
 import mmap
 from contextlib import contextmanager, closing
-from zipfile import ZipFile
+import zipfile
 
 from pandas.compat import StringIO, BytesIO, string_types, text_type
 from pandas import compat
@@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     return f, handles
 
 
-class BytesZipFile(ZipFile, BytesIO):
+class BytesZipFile(zipfile.ZipFile, BytesIO):
     """
     Wrapper for standard library class ZipFile and allow the returned file-like
     handle to accept byte strings via `write` method.
@@ -437,10 +437,10 @@ class BytesZipFile(ZipFile, BytesIO):
     bytes strings into a member of the archive.
     """
     # GH 17778
-    def __init__(self, file, mode='r', **kwargs):
+    def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
         if mode in ['wb', 'rb']:
             mode = mode.replace('b', '')
-        super(BytesZipFile, self).__init__(file, mode, **kwargs)
+        super(BytesZipFile, self).__init__(file, mode, compression, **kwargs)
 
     def write(self, data):
         super(BytesZipFile, self).writestr(self.filename, data)
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 0b329f64dafa3..bb7ee1b911fee 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 
 import pytest
+import os
 import collections
 from functools import partial
 
 import numpy as np
 
-from pandas import Series, Timestamp
+from pandas import Series, DataFrame, Timestamp
 from pandas.compat import range, lmap
 import pandas.core.common as com
 from pandas.core import ops
@@ -222,3 +223,21 @@ def test_standardize_mapping():
 
     dd = collections.defaultdict(list)
     assert isinstance(com.standardize_mapping(dd), partial)
+
+
+@pytest.mark.parametrize('obj', [
+    DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                     [12.32112, 123123.2, 321321.2]],
+              columns=['X', 'Y', 'Z']),
+    Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
+@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv'])
+def test_compression_size(obj, method, compression):
+    if not compression:
+        pytest.skip("only test compression case.")
+
+    with tm.ensure_clean() as filename:
+        getattr(obj, method)(filename, compression=compression)
+        compressed = os.path.getsize(filename)
+        getattr(obj, method)(filename, compression=None)
+        uncompressed = os.path.getsize(filename)
+        assert uncompressed > compressed