diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 944ce9b4fb1f6..27dfded808b95 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1,3 @@ custom: https://pandas.pydata.org/donate.html +github: [numfocus] tidelift: pypi/pandas diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml new file mode 100644 index 0000000000000..019ecfc484ca5 --- /dev/null +++ b/.github/workflows/assign.yml @@ -0,0 +1,15 @@ +name: Assign +on: + issue_comment: + types: created + +jobs: + one: + runs-on: ubuntu-latest + steps: + - name: + run: | + if [[ "${{ github.event.comment.body }}" == "take" ]]; then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000..5aa31e0ed3ab0 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,103 @@ +name: CI + +on: + push: + branches: master + pull_request: + branches: master + +env: + ENV_FILE: environment.yml + # TODO: remove export PATH=... in each step once this works + # PATH: $HOME/miniconda3/bin:$PATH + +jobs: + checks: + name: Checks + runs-on: ubuntu-latest + steps: + + - name: Checkout + uses: actions/checkout@v1 + + - name: Looking for unwanted patterns + run: ci/code_checks.sh patterns + if: true + + - name: Setup environment and build pandas + run: | + export PATH=$HOME/miniconda3/bin:$PATH + ci/setup_env.sh + if: true + + - name: Linting + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh lint + if: true + + - name: Dependencies consistency + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh dependencies + if: true + + - name: Checks on imported code + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh code + if: true + + - name: Running doctests + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh doctests + if: true + + - name: Docstring validation + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh docstrings + if: true + + - name: Typing validation + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh typing + if: true + + - name: Testing docstring validation script + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + pytest --capture=no --strict scripts + if: true + + - name: Running benchmarks + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + cd asv_bench + asv check -E existing + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + if git diff upstream/master --name-only | grep -q "^asv_bench/"; then + asv machine --yes + ASV_OUTPUT="$(asv dev)" + if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then + echo "##vso[task.logissue type=error]Benchmarks run with errors" + echo "$ASV_OUTPUT" + exit 1 + else + echo "Benchmarks run without errors" + fi + else + echo "Benchmarks did not run, no changes detected" + fi + if: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f98273a336cf..b34f5dfdd1a83 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/python/black - rev: stable + rev: 19.10b0 hooks: - id: black language_version: python3.7 @@ -9,7 +9,7 @@ repos: hooks: - id: flake8 language: python_venv - additional_dependencies: [flake8-comprehensions] + additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort rev: v4.3.20 hooks: diff --git a/.travis.yml b/.travis.yml index 398dd07089ef9..0acd386eea9ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,11 +30,9 @@ matrix: - python: 3.5 include: - - dist: bionic - # 18.04 - python: 3.8.0 + - dist: trusty env: - - JOB="3.8-dev" PATTERN="(not slow and not network)" + - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)" - dist: trusty env: @@ -85,19 +83,10 @@ install: - ci/submit_cython_cache.sh - echo "install done" - -before_script: - # display server (for clipboard functionality) needs to be started here, - # does not work if done in install:setup_env.sh (GH-26103) - - export DISPLAY=":99.0" - - echo "sh -e /etc/init.d/xvfb start" - - if [ "$JOB" != "3.8-dev" ]; then sh -e /etc/init.d/xvfb start; fi - - sleep 3 - script: - echo "script start" - echo "$JOB" - - if [ "$JOB" != "3.8-dev" ]; then source activate pandas-dev; fi + - source activate pandas-dev - ci/run_tests.sh after_script: diff --git a/Makefile b/Makefile index 27a2c3682de9c..f26689ab65ba5 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 black: - black . --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)' + black . develop: build python -m pip install --no-build-isolation -e . diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 4384ccb7fa8b3..a299e688a13ed 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -84,7 +84,7 @@ class ValueCounts: def setup(self, dropna): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_value_counts(self, dropna): @@ -102,7 +102,7 @@ def time_rendering(self): class SetCategories: def setup(self): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_set_categories(self): @@ -112,7 +112,7 @@ def time_set_categories(self): class RemoveCategories: def setup(self): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_remove_categories(self): @@ -164,9 +164,9 @@ def setup(self, dtype): np.random.seed(1234) n = 5 * 10 ** 5 sample_size = 100 - arr = [i for i in np.random.randint(0, n // 10, size=n)] + arr = list(np.random.randint(0, n // 10, size=n)) if dtype == "object": - arr = ["s{:04d}".format(i) for i in arr] + arr = [f"s{i:04d}" for i in arr] self.sample = np.random.choice(arr, sample_size) self.series = pd.Series(arr).astype("category") @@ -225,7 +225,7 @@ def setup(self, index): elif index == "non_monotonic": self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) else: - raise ValueError("Invalid index param: 
{}".format(index)) + raise ValueError(f"Invalid index param: {index}") self.scalar = 10000 self.list = list(range(10000)) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 3944e0bc523d8..a949ffdced576 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -99,7 +99,7 @@ class FromLists: def setup(self): N = 1000 M = 100 - self.data = [[j for j in range(M)] for i in range(N)] + self.data = [list(range(M)) for i in range(N)] def time_frame_from_lists(self): self.df = DataFrame(self.data) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index d57492dd37268..860c6cc6192bb 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -37,7 +37,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO # noqa: E402 isort:skip +from .pandas_vb_common import BaseIO # isort:skip class ParallelGroupbyMethods: @@ -250,13 +250,11 @@ def setup(self, dtype): np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) ), "object": DataFrame( - "foo", - index=range(rows), - columns=["object%03d".format(i) for i in range(5)], + "foo", index=range(rows), columns=["object%03d" for _ in range(5)] ), } - self.fname = "__test_{}__.csv".format(dtype) + self.fname = f"__test_{dtype}__.csv" df = data[dtype] df.to_csv(self.fname) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index a94960d494707..f1d5209ac65ef 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -146,7 +146,7 @@ class Indexing: def setup(self, dtype): N = 10 ** 6 - self.idx = getattr(tm, "make{}Index".format(dtype))(N) + self.idx = getattr(tm, f"make{dtype}Index")(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = self.idx.sort_values() diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9b8599b0a1b64..b8e8630e663ee 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -132,7 +132,7 @@ class ReadCSVConcatDatetimeBadDateValue(StringIORewind): param_names = ["bad_date_value"] def setup(self, bad_date_value): - self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000) + self.StringIO_input = StringIO((f"{bad_date_value},\n") * 50000) def time_read_csv(self, bad_date_value): read_csv( @@ -202,7 +202,7 @@ def setup(self, sep, thousands): data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) df = DataFrame(data) if thousands is not None: - fmt = ":{}".format(thousands) + fmt = f":{thousands}" fmt = "{" + fmt + "}" df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) @@ -231,7 +231,7 @@ def setup(self, sep, decimal, float_precision): floats = [ "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) ] - rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n" + rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" data = rows * 5 data = data.format(*floats) * 200 # 1000 x 3 strings csv self.StringIO_input = StringIO(data) @@ -309,9 +309,7 @@ class ReadCSVCachedParseDates(StringIORewind): param_names = ["do_cache"] def setup(self, do_cache): - data = ( - "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n" - ) * 10 + data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): @@ -336,7 +334,7 @@ class ReadCSVMemoryGrowth(BaseIO): def setup(self): 
with open(self.fname, "w") as f: for i in range(self.num_rows): - f.write("{i}\n".format(i=i)) + f.write(f"{i}\n") def mem_parser_chunks(self): # see gh-24805. diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index c97cf768e27d9..75d87140488e3 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -14,7 +14,7 @@ def _generate_dataframe(): C = 5 df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index b78dc63d17130..88c1a3dc48ea4 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -115,7 +115,7 @@ def setup(self, format): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 5c1d39776b91c..8f037e94e0095 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -20,7 +20,7 @@ def setup(self, orient, index): } df = DataFrame( np.random.randn(N, 5), - columns=["float_{}".format(i) for i in range(5)], + columns=[f"float_{i}" for i in range(5)], index=indexes[index], ) df.to_json(self.fname, orient=orient) @@ -43,7 +43,7 @@ def setup(self, index): } df = DataFrame( np.random.randn(N, 5), - columns=["float_{}".format(i) for i in range(5)], + columns=[f"float_{i}" for i in range(5)], index=indexes[index], ) df.to_json(self.fname, orient="records", lines=True) diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index f5038602539ab..a5b8b81bed85b 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -15,7 +15,7 @@ def setup(self): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 647e9d27dec9d..12620656dd2bf 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -13,7 +13,7 @@ def setup(self): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index fe84c869717e3..6cc7f56ae3d65 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -19,7 +19,7 @@ def setup(self, connection): "sqlite": sqlite3.connect(":memory:"), } self.table_name = "test_type" - self.query_all = "SELECT * FROM {}".format(self.table_name) + self.query_all = f"SELECT * FROM {self.table_name}" self.con = con[connection] self.df = DataFrame( { @@ -58,7 +58,7 @@ def setup(self, connection, dtype): "sqlite": sqlite3.connect(":memory:"), } self.table_name = "test_type" - self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name) + self.query_col = f"SELECT {dtype} FROM {self.table_name}" self.con = con[connection] self.df = 
DataFrame( { diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 28829785d72e9..f3125f8598418 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -17,7 +17,7 @@ def setup(self, convert_dates): C = self.C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(self.N) @@ -47,7 +47,7 @@ def setup(self, convert_dates): for i in range(10): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan - self.df["missing_{0}".format(i)] = missing_data + self.df[f"missing_{i}"] = missing_data self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index a960f43f46acd..77ce1b2763bce 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -3,7 +3,7 @@ import pandas as pd try: - import pandas.tseries.holiday # noqa + import pandas.tseries.holiday except ImportError: pass diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 828134b80aa3d..37418d752f833 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -14,8 +14,8 @@ def setup(self): self.str_days = [] self.str_seconds = [] for i in self.ints: - self.str_days.append("{0} days".format(i)) - self.str_seconds.append("00:00:{0:02d}".format(i)) + self.str_days.append(f"{i} days") + self.str_seconds.append(f"00:00:{i:02d}") def time_convert_int(self): to_timedelta(self.ints, unit="s") @@ -34,7 +34,7 @@ class ToTimedeltaErrors: def setup(self, errors): ints = np.random.randint(0, 60, size=10000) - self.arr = ["{0} days".format(i) for i in ints] + self.arr = [f"{i} days" for i in ints] self.arr[-1] = "apple" def time_convert(self, errors): diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index d6afb263b447f..66960ca2c6c10 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -73,33 +73,16 @@ jobs: - task: PublishTestResults@2 inputs: - testResultsFiles: 'test-data-*.xml' + testResultsFiles: 'test-data.xml' testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} displayName: 'Publish test results' - powershell: | - $junitXml = "test-data-single.xml" - $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { - Write-Host "No test failures in test-data-single" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" - } - - $junitXmlMulti = "test-data-multiple.xml" - $(Get-Content $junitXmlMulti | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { - Write-Host "No test failures in test-data-multi" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" + $(Get-Content "test-data.xml" | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) { + Write-Host "No test failures in test-data" + } else { + Write-Error "$($matches[1]) tests failed" # will produce $LASTEXITCODE=1 } displayName: 'Check for test failures' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index dfa82819b9826..86807b4010988 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -11,10 +11,12 @@ jobs: py36_np15: ENV_FILE: ci/deps/azure-windows-36.yaml CONDA_PY: "36" + PATTERN: "not slow and not network" py37_np141: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" + 
PATTERN: "not slow and not network" steps: - powershell: | @@ -22,38 +24,32 @@ jobs: Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" displayName: 'Add conda to PATH' - script: conda update -q -n base conda - displayName: Update conda - - script: | - call activate + displayName: 'Update conda' + - bash: | conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml displayName: 'Create anaconda environment' - - script: | - call activate pandas-dev - call conda list + - bash: | + source activate pandas-dev + conda list ci\\incremental\\build.cmd displayName: 'Build' - - script: | - call activate pandas-dev - pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* + - bash: | + source activate pandas-dev + ci/run_tests.sh displayName: 'Test' - task: PublishTestResults@2 inputs: testResultsFiles: 'test-data.xml' testRunTitle: 'Windows-$(CONDA_PY)' - powershell: | - $junitXml = "test-data.xml" - $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { + $(Get-Content "test-data.xml" | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) { Write-Host "No test failures in test-data" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" + } else { + Write-Error "$($matches[1]) tests failed" # will produce $LASTEXITCODE=1 } displayName: 'Check for test failures' - - script: | + - bash: | source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' diff --git a/ci/build38.sh b/ci/build38.sh deleted file mode 100644 index 66eb5cad38475..0000000000000 --- a/ci/build38.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -e -# Special build for python3.8 until numpy puts its own wheels up - -sudo apt-get install build-essential gcc xvfb -pip install --no-deps -U pip wheel setuptools -pip install python-dateutil pytz cython pytest pytest-xdist hypothesis - -# Possible alternative for getting numpy: -pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy - -python setup.py build_ext -inplace -python -m pip install -v --no-build-isolation -e . - -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis" - -# TODO: Is there anything else in setup_env that we really want to do? -# ci/setup_env.sh diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 07c99b39e83e8..7c6c98d910492 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -56,7 +56,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then black --version MSG='Checking black formatting' ; echo $MSG - black . --check --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)' + black . --check RET=$(($RET + $?)) ; echo $MSG "DONE" # `setup.cfg` contains the list of error codes that are being ignored in flake8 @@ -190,6 +190,14 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.rst" ".. ipython ::" doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for extra blank lines after the class definition' ; echo $MSG + invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . 
+ RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of comment-based annotation syntax' ; echo $MSG + invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG set -o pipefail if [[ "$AZURE" == "true" ]]; then diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 1e2e6c33e8c15..cf3fca307481f 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -3,21 +3,24 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - attrs=19.1.0 - gcc_linux-32 - - gcc_linux-32 - gxx_linux-32 - numpy=1.14.* - python-dateutil - - python=3.6.* - pytz=2017.2 - # universal - - pytest - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 + + # see comment above - pip - pip: - # Anaconda doesn't build a new enough Cython - cython>=0.29.13 + - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 76868f598f11b..c3c94e365c259 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -3,28 +3,30 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4==4.6.0 - bottleneck=1.2.* - - cython=0.29.13 - lxml - matplotlib=2.2.2 - numpy=1.14.* - openpyxl=2.4.8 - python-dateutil - python-blosc - - python=3.6.* - pytz=2017.2 - scipy - sqlalchemy=1.1.4 - xlrd=1.1.0 - xlsxwriter=0.9.8 - xlwt=1.2.0 - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 - pip - pip: - html5lib==1.0b2 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 21205375204dc..46ddd44931848 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -3,8 +3,17 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.6.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 - gcsfs - html5lib - ipython @@ -17,7 +26,6 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.6.* - pytz - s3fs - scipy @@ -25,12 +33,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=4.0.2 - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - moto - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index e2c78165fe4b9..ff1095005fa85 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -3,25 +3,27 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.1 + + # tools + - cython=0.29.13 + - pytest=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - - cython>=0.29.13 - jinja2=2.8 - numexpr=2.6.2 - numpy=1.13.3 - openpyxl=2.4.8 - pytables=3.4.2 - python-dateutil=2.6.1 - - python=3.6.1 - pytz=2017.2 - scipy=0.19.0 - xlrd=1.1.0 - xlsxwriter=0.9.8 - xlwt=1.2.0 - # universal - 
html5lib=1.0.1 - - hypothesis>=3.58.0 - - pytest=4.5.0 - - pytest-xdist - - pytest-mock - - pytest-azurepipelines diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 24464adb74f5b..3319afed173b5 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -3,8 +3,17 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.7.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 - html5lib - ipython - jinja2 @@ -17,7 +26,6 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.7.* - pytz - s3fs - scipy @@ -25,11 +33,3 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 0fb06fd43724c..a04bdc2448bce 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -3,14 +3,16 @@ channels: - defaults dependencies: - python=3.7.* - - pytz - - Cython>=0.29.13 - # universal - # pytest < 5 until defaults has pytest-xdist>=1.29.0 - - pytest>=4.0.2,<5.0 - - pytest-xdist - - pytest-mock + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - pytz - pip - pip: - "git+git://github.com/dateutil/dateutil.git" @@ -18,5 +20,3 @@ dependencies: - "--pre" - "numpy" - "scipy" - # https://github.com/pandas-dev/pandas/issues/27421 - - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 85c090bf6f938..831b68d0bb4d3 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -2,6 +2,16 @@ name: pandas-dev channels: - defaults dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4 - bottleneck - html5lib @@ -14,7 +24,6 @@ dependencies: - openpyxl - pyarrow - pytables - - python=3.6.* - python-dateutil==2.6.1 - pytz - xarray @@ -23,13 +32,4 @@ dependencies: - xlwt - pip - pip: - # Anaconda / conda-forge don't build for 3.5 - - cython>=0.29.13 - pyreadstat - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - hypothesis>=3.58.0 - # https://github.com/pandas-dev/pandas/issues/27421 - - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 88b38aaef237c..aa3962da9b4f0 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -3,9 +3,19 @@ channels: - conda-forge - defaults dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - blosc - bottleneck - - fastparquet>=0.2.1 + - fastparquet>=0.3.2 - matplotlib=3.0.2 - numexpr - numpy=1.15.* @@ -13,16 +23,8 @@ dependencies: - pyarrow - pytables - python-dateutil - - python=3.6.* - pytz - scipy - xlrd - xlsxwriter - xlwt - # universal - - cython>=0.29.13 - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 7680ed9fd9c92..928896efd5fc4 100644 --- a/ci/deps/azure-windows-37.yaml +++ 
b/ci/deps/azure-windows-37.yaml @@ -3,6 +3,16 @@ channels: - defaults - conda-forge dependencies: + - python=3.7.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4 - bottleneck - gcsfs @@ -15,7 +25,6 @@ dependencies: - numpy=1.14.* - openpyxl - pytables - - python=3.7.* - python-dateutil - pytz - s3fs @@ -24,11 +33,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - cython>=0.29.13 - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 - pyreadstat diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index e4e917d13990c..ddc1ea41a08a3 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -3,11 +3,21 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-cov # this is only needed in the coverage build + + # pandas dependencies - beautifulsoup4 - botocore>=1.11 - cython>=0.29.13 - dask - - fastparquet>=0.2.1 + - fastparquet>=0.3.2 - gcsfs - geopandas - html5lib @@ -27,9 +37,8 @@ dependencies: - pymysql - pytables - python-snappy - - python=3.6.* - pytz - - s3fs<0.3 + - s3fs - scikit-learn - scipy - sqlalchemy @@ -38,12 +47,6 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-cov - - pytest-mock - - hypothesis>=3.58.0 - pip - pip: - brotlipy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 44795766d7c31..d0bc046575953 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -3,11 +3,19 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies - beautifulsoup4 - blosc=1.14.3 - python-blosc - - cython>=0.29.13 - - fastparquet=0.2.1 + - fastparquet=0.3.2 - gcsfs=0.2.2 - html5lib - ipython @@ -24,19 +32,11 @@ dependencies: - pymysql=0.7.11 - pytables - python-dateutil - - python=3.6.* - pytz - - s3fs=0.0.8 + - s3fs=0.3.0 - scipy - sqlalchemy=1.1.4 - xarray=0.10 - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index d54708d48a65e..1dfd90d0904ac 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -3,8 +3,16 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.6.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - beautifulsoup4 - html5lib - lxml - matplotlib @@ -16,17 +24,11 @@ dependencies: - pymysql - pytables - python-dateutil - - python=3.6.* - pytz - - s3fs<0.3 + - s3fs - scipy - sqlalchemy - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - moto - - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 440ca6c480b87..6826a9d072ff3 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -5,20 +5,22 @@ channels: - c3i_test dependencies: - python=3.7.* - - botocore>=1.11 + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - botocore>=1.11 - numpy - python-dateutil - 
nomkl - pyarrow - pytz - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - hypothesis>=3.58.0 - - s3fs<0.3 - - pip + - s3fs - pyreadstat + - pip - pip: - moto diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml new file mode 100644 index 0000000000000..828f02596a70e --- /dev/null +++ b/ci/deps/travis-38.yaml @@ -0,0 +1,19 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.8.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - numpy + - python-dateutil + - nomkl + - pytz + - pip diff --git a/ci/print_skipped.py b/ci/print_skipped.py index e99e789a71fe8..51a2460e05fab 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -27,14 +27,13 @@ def main(filename): if __name__ == "__main__": print("SKIPPED TESTS:") i = 1 - for file_type in ("-single", "-multiple", ""): - for test_data in main("test-data{}.xml".format(file_type)): - if test_data is None: - print("-" * 80) - else: - print( - "#{i} {class_name}.{test_name}: {message}".format( - **dict(test_data, i=i) - ) + for test_data in main("test-data.xml"): + if test_data is None: + print("-" * 80) + else: + print( + "#{i} {class_name}.{test_name}: {message}".format( + **dict(test_data, i=i) ) - i += 1 + ) + i += 1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index d1a9447c97d4e..b91cfb3bed8cc 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -15,37 +15,29 @@ if [ -n "$LOCALE_OVERRIDE" ]; then # exit 1 fi fi + if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi - -if [ -n "$PATTERN" ]; then - PATTERN=" and $PATTERN" +if [ "$COVERAGE" ]; then + COVERAGE_FNAME="/tmp/test_coverage.xml" + COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" fi -for TYPE in single multiple -do - if [ "$COVERAGE" ]; then - COVERAGE_FNAME="/tmp/coc-$TYPE.xml" - COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" - fi +PYTEST_CMD="pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" - TYPE_PATTERN=$TYPE - NUM_JOBS=1 - if [[ "$TYPE_PATTERN" == "multiple" ]]; then - TYPE_PATTERN="not single" - NUM_JOBS=2 - fi +# Travis does not have an X server +if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + export DISPLAY=":99.0" + PYTEST_CMD="xvfb-run -e /dev/stdout $PYTEST_CMD" +fi - PYTEST_CMD="pytest -m \"$TYPE_PATTERN$PATTERN\" -n $NUM_JOBS -s --strict --durations=10 --junitxml=test-data-$TYPE.xml $TEST_ARGS $COVERAGE pandas" - echo $PYTEST_CMD - # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code - sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret" +echo $PYTEST_CMD +sh -c "$PYTEST_CMD" - if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then - echo "uploading coverage for $TYPE tests" - echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" - bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME - fi -done +if [[ "$COVERAGE" && $?
== 0 && "$TRAVIS_BRANCH" == "master" ]]; then + echo "uploading coverage" + echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" + bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME +fi diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 4d454f9c5041a..3d79c0cfd7000 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,10 +1,5 @@ #!/bin/bash -e -if [ "$JOB" == "3.8-dev" ]; then - /bin/bash ci/build38.sh - exit 0 -fi - # edit the locale file if needed if [ -n "$LOCALE_OVERRIDE" ]; then echo "Adding locale to the first line of pandas/__init__.py" @@ -114,6 +109,11 @@ echo "w/o removing anything else" conda remove pandas -y --force || true pip uninstall -y pandas || true +echo +echo "remove postgres if has been installed with conda" +echo "we use the one from the CI" +conda remove postgresql -y --force || true + echo echo "conda list pandas" conda list pandas diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 8fe5b174c77d3..d7b3e159f8ce7 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -24,6 +24,27 @@ and `good first issue where you could start out. Once you've found an interesting issue, you can return here to get your development environment setup. +When you start working on an issue, it's a good idea to assign the issue to yourself, +so nobody else duplicates the work on it. GitHub restricts assigning issues to maintainers +of the project only. In most projects, and until recently in pandas, contributors added a +comment letting others know they are working on an issue. While this is ok, you need to +check each issue individually, and it's not possible to find the unassigned ones. + +For this reason, we implemented a workaround consisting of adding a comment with the exact +text `take`. When you do it, a GitHub action will automatically assign you the issue +(this will take seconds, and may require refreshint the page to see it). +By doing this, it's possible to filter the list of issues and find only the unassigned ones. + +So, a good way to find an issue to start contributing to pandas is to check the list of +`unassigned good first issues `_ +and assign yourself one you like by writing a comment with the exact text `take`. + +If for whatever reason you are not able to continue working with the issue, please try to +unassign it, so other people know it's available again. You can check the list of +assigned issues, since people may not be working in them anymore. If you want to work on one +that is assigned, feel free to kindly ask the current assignee if you can take it +(please allow at least a week of inactivity before considering work in the issue discontinued). + Feel free to ask questions on the `mailing list `_ or on `Gitter`_. @@ -633,6 +654,9 @@ submitting code to run the check yourself:: to auto-format your code. Additionally, many editors have plugins that will apply ``black`` as you edit files. +You should use a ``black`` version >= 19.10b0 as previous versions are not compatible +with the pandas codebase. + Optionally, you may wish to setup `pre-commit hooks `_ to automatically run ``black`` and ``flake8`` when you make a git commit. This can be done by installing ``pre-commit``:: @@ -780,7 +804,7 @@ Types imports should follow the ``from typing import ...`` convention. 
So rather import typing - primes = [] # type: typing.List[int] + primes: typing.List[int] = [] You should write @@ -788,19 +812,19 @@ You should write from typing import List, Optional, Union - primes = [] # type: List[int] + primes: List[int] = [] ``Optional`` should be used where applicable, so instead of .. code-block:: python - maybe_primes = [] # type: List[Union[int, None]] + maybe_primes: List[Union[int, None]] = [] You should write .. code-block:: python - maybe_primes = [] # type: List[Optional[int]] + maybe_primes: List[Optional[int]] = [] In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -816,7 +840,7 @@ The appropriate way to annotate this would be as follows str_type = str class SomeClass2: - str = None # type: str_type + str: str_type = None In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example @@ -922,7 +946,7 @@ extensions in `numpy.testing .. note:: - The earliest supported pytest version is 4.0.2. + The earliest supported pytest version is 5.0.1. Writing tests ~~~~~~~~~~~~~ diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e341dcb8318bc..89d43e8a43825 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -251,6 +251,48 @@ To use a test, subclass it: See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py for a list of all the tests available. +.. _extending.extension.arrow: + +Compatibility with Apache Arrow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An ``ExtensionArray`` can support conversion to / from ``pyarrow`` arrays +(and thus support for example serialization to the Parquet file format) +by implementing two methods: ``ExtensionArray.__arrow_array__`` and +``ExtensionDtype.__from_arrow__``. + +The ``ExtensionArray.__arrow_array__`` method ensures that ``pyarrow`` knows how +to convert the specific extension array into a ``pyarrow.Array`` (also when +included as a column in a pandas DataFrame): + +.. code-block:: python + + class MyExtensionArray(ExtensionArray): + ... + + def __arrow_array__(self, type=None): + # convert the underlying array values to a pyarrow Array + import pyarrow + return pyarrow.array(..., type=type) + +The ``ExtensionDtype.__from_arrow__`` method then controls the conversion +back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow +``Array`` or ``ChunkedArray`` as its only argument and is expected to return the +appropriate pandas ``ExtensionArray`` for this dtype and the passed values: + +.. code-block:: none + + class ExtensionDtype: + ... + + def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray: + ... + +See more in the `Arrow documentation `__. + +Those methods have been implemented for the nullable integer and string extension +dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format. + .. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py ..
_extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index 1228f00667f3a..757b197c717e6 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -13,8 +13,10 @@ Development :maxdepth: 2 contributing + maintaining internals extending developer policies roadmap + meeting diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst new file mode 100644 index 0000000000000..0d1088cc8a6ca --- /dev/null +++ b/doc/source/development/maintaining.rst @@ -0,0 +1,193 @@ +.. _maintaining: + +****************** +Pandas Maintenance +****************** + +This guide is for pandas' maintainers. It may also be interesting to contributors +looking to understand the pandas development process and what steps are necessary +to become a maintainer. + +The main contributing guide is available at :ref:`contributing`. + +Roles +----- + +Pandas uses two levels of permissions: **triage** and **core** team members. + +Triage members can label and close issues and pull requests. + +Core team members can label and close issues and pull requests, and can merge +pull requests. + +GitHub publishes the full `list of permissions`_. + +Tasks +----- + +Pandas is largely a volunteer project, so these tasks shouldn't be read as +"expectations" of triage and maintainers. Rather, they're general descriptions +of what it means to be a maintainer. + +* Triage newly filed issues (see :ref:`maintaining.triage`) +* Review newly opened pull requests +* Respond to updates on existing issues and pull requests +* Drive discussion and decisions on stalled issues and pull requests +* Provide experience / wisdom on API design questions to ensure consistency and maintainability +* Project organization (run / attend developer meetings, represent pandas) + +http://matthewrocklin.com/blog/2019/05/18/maintainer may be interesting background +reading. + +.. _maintaining.triage: + +Issue Triage +------------ + + +Here's a typical workflow for triaging a newly opened issue. + +1. **Thank the reporter for opening an issue** + + The issue tracker is many people's first interaction with the pandas project itself, + beyond just using the library. As such, we want it to be a welcoming, pleasant + experience. + +2. **Is the necessary information provided?** + + Ideally reporters would fill out the issue template, but many don't. + If crucial information (like the version of pandas they used) is missing, + feel free to ask for that and label the issue with "Needs info". The + report should follow the guidelines in :ref:`contributing.bug_reports`. + You may want to link to that if they didn't follow the template. + + Make sure that the title accurately reflects the issue. Edit it yourself + if it's not clear. + +3. **Is this a duplicate issue?** + + We have many open issues. If a new issue is clearly a duplicate, label the + new issue as "Duplicate", assign the milestone "No Action", and close the issue + with a link to the original issue. Make sure to still thank the reporter, and + encourage them to chime in on the original issue, and perhaps try to fix it. + + If the new issue provides relevant information, such as a better or slightly + different example, add it to the original issue as a comment or an edit to + the original post. + +4. **Is the issue minimal and reproducible?** + + For bug reports, we ask that the reporter provide a minimal reproducible + example.
See http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports + for a good explanation. If the example is not reproducible, or if it's + *clearly* not minimal, feel free to ask the reporter if they can provide + an example or simplify the provided one. Do acknowledge that writing + minimal reproducible examples is hard work. If the reporter is struggling, + you can try to write one yourself and we'll edit the original post to include it. + + If a reproducible example can't be provided, add the "Needs info" label. + + If a reproducible example is provided, but you see a simplification, + edit the original post with your simpler reproducible example. + +5. **Is this a clearly defined feature request?** + + Generally, pandas prefers to discuss and design new features in issues, before + a pull request is made. Encourage the submitter to include a proposed API + for the new feature. Having them write a full docstring is a good way to + pin down specifics. + + We'll need a discussion from several pandas maintainers before deciding whether + the proposal is in scope for pandas. + +6. **Is this a usage question?** + + We prefer that usage questions are asked on StackOverflow with the pandas + tag. https://stackoverflow.com/questions/tagged/pandas + + If it's easy to answer, feel free to link to the relevant documentation section, + let them know that in the future this kind of question should be on + StackOverflow, and close the issue. + +7. **What labels and milestones should I add?** + + Apply the relevant labels. This is a bit of an art, and comes with experience. + Look at similar issues to get a feel for how things are labeled. + + If the issue is clearly defined and the fix seems relatively straightforward, + label the issue as "Good first issue". + + Typically, new issues will be assigned the "Contributions welcome" milestone, + unless it's known that this issue should be addressed in a specific release (say + because it's a large regression). + +.. _maintaining.closing: + +Closing Issues +-------------- + +Be delicate here: many people interpret closing an issue as us saying that the +conversation is over. It's typically best to give the reporter some time to +respond or self-close their issue if it's determined that the behavior is not a bug, +or the feature is out of scope. Sometimes reporters just go away though, and +we'll close the issue after the conversation has died. + +Reviewing Pull Requests +----------------------- + +Anybody can review a pull request: regular contributors, triagers, or core-team +members. Here are some guidelines to check. + +* Tests should be in a sensible location. +* New public APIs should be included somewhere in ``doc/source/reference/``. +* New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring. +* User-facing changes should have a whatsnew in the appropriate file. +* Regression tests should reference the original GitHub issue number like ``# GH-1234``. + +Cleaning up old Issues +---------------------- + +Every open issue in pandas has a cost. Open issues make finding duplicates harder, +and can make it harder to know what needs to be done in pandas. That said, closing +issues isn't a goal on its own. Our goal is to make pandas the best it can be, +and that's best done by ensuring that the quality of our open issues is high. + +Occasionally, bugs are fixed but the issue isn't linked to in the Pull Request. +In these cases, comment that "This has been fixed, but could use a test."
and +label the issue as "Good First Issue" and "Needs Test". + +If an older issue doesn't follow our issue template, edit the original post to +include a minimal example, the actual output, and the expected output. Uniformity +in issue reports is valuable. + +If an older issue lacks a reproducible example, label it as "Needs Info" and +ask the reporter to provide one (or write one yourself if possible). If one isn't +provided reasonably soon, close it according to the policies in :ref:`maintaining.closing`. + +Cleaning up old Pull Requests +----------------------------- + +Occasionally, contributors are unable to finish off a pull request. +If some time has passed (two weeks, say) since the last review requesting changes, +gently ask if they're still interested in working on this. If another two weeks or +so passes with no response, thank them for their work and close the pull request. +Comment on the original issue that "There's a stalled PR at #1234 that may be +helpful.", and perhaps label the issue as "Good first issue" if the PR was relatively +close to being accepted. + +Additionally, core-team members can push to contributors' branches. This can be +helpful for pushing an important PR across the line, or for fixing a small +merge conflict. + +Becoming a pandas maintainer +---------------------------- + +The full process is outlined in our `governance documents`_. In summary, +we're happy to give triage permissions to anyone who shows interest by +being helpful on the issue tracker. + +The current list of core-team members is at +https://github.com/pandas-dev/pandas-governance/blob/master/people.md + +.. _governance documents: https://github.com/pandas-dev/pandas-governance +.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization \ No newline at end of file diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst new file mode 100644 index 0000000000000..1d19408692cda --- /dev/null +++ b/doc/source/development/meeting.rst @@ -0,0 +1,32 @@ +.. _meeting: + +================== +Developer Meetings +================== + +We hold regular developer meetings on the second Wednesday +of each month at 18:00 UTC. These meetings and their minutes are open to +the public. All are welcome to join. + +Minutes +------- + +The minutes of past meetings are available in `this Google Document `__. + +Calendar +-------- + +This calendar shows all the developer meetings. + +.. raw:: html + + + +You can subscribe to this calendar with the following links: + +* `iCal `__ +* `Google calendar `__ + +Additionally, we'll sometimes have one-off meetings on specific topics. +These will be published on the same calendar. + diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 125990f7cadcd..6301fee7775cf 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1950,6 +1950,7 @@ sparse :class:`SparseDtype` (none) :class:`arrays. intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` nullable integer :class:`Int64Dtype`, ...
(none) :class:`arrays.IntegerArray` :ref:`integer_na` Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` +Boolean (with NA) :class:`BooleanDtype` :class:`bool` :class:`arrays.BooleanArray` :ref:`api.arrays.bool` =================== ========================= ================== ============================= ============================= Pandas has two ways to store strings. diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1ca1640f9a7c6..04df37427e4f5 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -177,7 +177,7 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of the code base as of this writing. To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest -`__ >= 4.0.2 and `Hypothesis +`__ >= 5.0.1 and `Hypothesis `__ >= 3.58, then run: :: @@ -250,7 +250,7 @@ SQLAlchemy 1.1.4 SQL support for databases other tha SciPy 0.19.0 Miscellaneous statistical functions XLsxWriter 0.9.8 Excel writing blosc Compression for msgpack -fastparquet 0.2.1 Parquet reading / writing +fastparquet 0.3.2 Parquet reading / writing gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) @@ -263,7 +263,7 @@ pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O -s3fs 0.0.8 Amazon S3 access +s3fs 0.3.0 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.1.0 Excel reading diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 0c435e06ac57f..cf14d28772f4c 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -25,6 +25,7 @@ Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.array Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string` +Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). @@ -485,6 +486,28 @@ The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arra See :ref:`api.series.str` for more. +.. _api.arrays.bool: + +Boolean data with missing values +-------------------------------- + +The boolean dtype (with the alias ``"boolean"``) provides support for storing +boolean data (True, False values) with missing values, which is not possible +with a bool :class:`numpy.ndarray`. + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + arrays.BooleanArray + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + BooleanDtype + + .. Dtype attributes which are manually listed in their docstrings: including .. 
it here to make sure a docstring page is built for them diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4b5faed0f4d2d..4540504974f56 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -30,7 +30,6 @@ Attributes and underlying data DataFrame.dtypes DataFrame.ftypes DataFrame.get_dtype_counts - DataFrame.get_ftype_counts DataFrame.select_dtypes DataFrame.values DataFrame.get_values @@ -40,7 +39,6 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty - DataFrame.is_copy Conversion ~~~~~~~~~~ @@ -142,8 +140,6 @@ Computations / descriptive stats DataFrame.all DataFrame.any DataFrame.clip - DataFrame.clip_lower - DataFrame.clip_upper DataFrame.compound DataFrame.corr DataFrame.corrwith @@ -351,7 +347,6 @@ Serialization / IO / conversion :toctree: api/ DataFrame.from_dict - DataFrame.from_items DataFrame.from_records DataFrame.info DataFrame.to_parquet diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 9c69770c0f1b7..0961acc43f301 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -97,13 +97,11 @@ Scalar introspection api.types.is_bool api.types.is_categorical api.types.is_complex - api.types.is_datetimetz api.types.is_float api.types.is_hashable api.types.is_integer api.types.is_interval api.types.is_number - api.types.is_period api.types.is_re api.types.is_re_compilable api.types.is_scalar diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 59910ba357130..c501e8bc91379 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -45,7 +45,6 @@ Attributes Series.dtypes Series.ftypes Series.data - Series.is_copy Series.name Series.put @@ -148,8 +147,6 @@ Computations / descriptive stats Series.autocorr Series.between Series.clip - Series.clip_lower - Series.clip_upper Series.corr Series.count Series.cov diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 3d155535e2585..24a47336b0522 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -41,6 +41,7 @@ Style application Styler.set_caption Styler.set_properties Styler.set_uuid + Styler.set_na_rep Styler.clear Styler.pipe diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index c6eadd2adadce..31bb71064d735 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -554,6 +554,31 @@ index. Both ``rename`` and ``rename_axis`` support specifying a dictionary, ``Series`` or a mapping function to map labels/names to new values. +When working with an ``Index`` object directly, rather than via a ``DataFrame``, +:meth:`Index.set_names` can be used to change the names. + +.. ipython:: python + + mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi.names + + mi2 = mi.rename("new name", level=0) + mi2 + +.. warning:: + + Prior to pandas 1.0.0, you could also set the names of a ``MultiIndex`` + by updating the name of a level. + + .. code-block:: none + + >>> mi.levels[0].name = 'name via level' + >>> mi.names[0] # only works for older pandas + 'name via level' + + As of pandas 1.0, this will *silently* fail to update the names + of the MultiIndex. Use :meth:`Index.set_names` instead.
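+
+   For example, a minimal sketch of the supported approach, reusing the ``mi`` object
+   created above (the new name here is only illustrative):
+
+   .. code-block:: python
+
+      # set_names returns a new index, so keep the result to pick up the new name
+      mi = mi.set_names("name via set_names", level=0)
+      mi.names
+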
+ Sorting a ``MultiIndex`` ------------------------ diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 97b9c2f95dc50..f1f3d79eed61e 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -30,7 +30,7 @@ you must explicitly pass the dtype into :meth:`array` or :class:`Series`: .. ipython:: python - arr = pd.array([1, 2, np.nan], dtype=pd.Int64Dtype()) + arr = pd.array([1, 2, None], dtype=pd.Int64Dtype()) arr Or the string alias ``"Int64"`` (note the capital ``"I"``, to differentiate from @@ -63,7 +63,7 @@ up with a ``float64`` dtype Series: pd.Series([1, 2, np.nan]) Operations involving an integer array will behave similar to NumPy arrays. -Missing values will be propagated, and and the data will be coerced to another +Missing values will be propagated, and the data will be coerced to another dtype if needed. .. ipython:: python diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f8e174abfd193..fa47a5944f7bf 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4671,7 +4671,6 @@ See the `Full Documentation `__. Write to a feather file. .. ipython:: python - :okwarning: df.to_feather('example.feather') @@ -4717,6 +4716,9 @@ Several caveats. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. +* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data + type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols, + see the :ref:`extension types documentation `). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, @@ -4748,7 +4750,6 @@ See the documentation for `pyarrow `__ an Write to a parquet file. .. ipython:: python - :okwarning: df.to_parquet('example_pa.parquet', engine='pyarrow') df.to_parquet('example_fp.parquet', engine='fastparquet') @@ -4765,7 +4766,6 @@ Read from a parquet file. Read only certain columns of a parquet file. .. ipython:: python - :okwarning: result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b']) @@ -4788,7 +4788,6 @@ Serializing a ``DataFrame`` to parquet may include the implicit index as one or more columns in the output file. Thus, this code: .. ipython:: python - :okwarning: df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) df.to_parquet('test.parquet', engine='pyarrow') @@ -4805,7 +4804,6 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to :func:`~pandas.DataFrame.to_parquet`: .. 
ipython:: python - :okwarning: df.to_parquet('test.parquet', index=False) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index e0dc2e734e660..5e026e3a7d78f 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -67,6 +67,7 @@ "df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n", "df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n", " axis=1)\n", + "df.iloc[3, 3] = np.nan\n", "df.iloc[0, 2] = np.nan" ] }, @@ -402,6 +403,38 @@ "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can format the text displayed for missing values by `na_rep`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.style.format(\"{:.2%}\", na_rep=\"-\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These formatting techniques can be used in combination with styling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.style.highlight_max().format(None, na_rep=\"-\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -659,6 +692,7 @@ "- precision\n", "- captions\n", "- table-wide styles\n", + "- missing values representation\n", "- hiding the index or columns\n", "\n", "Each of these can be specified in two ways:\n", @@ -800,6 +834,32 @@ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can control the default missing values representation for the entire table through `set_na_rep` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(df.style\n", + " .set_na_rep(\"FAIL\")\n", + " .format(None, na_rep=\"PASS\", subset=[\"D\"])\n", + " .highlight_null(\"yellow\"))" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index d521c745ccfe5..072871f89bdae 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -13,7 +13,7 @@ Text Data Types .. versionadded:: 1.0.0 -There are two main ways to store text data +There are two ways to store text data in pandas: 1. ``object`` -dtype NumPy array. 2. :class:`StringDtype` extension type. @@ -63,7 +63,40 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s s.astype("string") -Everything that follows in the rest of this document applies equally to +.. _text.differences: + +Behavior differences +^^^^^^^^^^^^^^^^^^^^ + +These are places where the behavior of ``StringDtype`` objects differ from +``object`` dtype + +l. For ``StringDtype``, :ref:`string accessor methods` + that return **numeric** output will always return a nullable integer dtype, + rather than either int or float dtype, depending on the presence of NA values. + + .. ipython:: python + + s = pd.Series(["a", None, "b"], dtype="string") + s + s.str.count("a") + s.dropna().str.count("a") + + Both outputs are ``Int64`` dtype. Compare that with object-dtype + + .. ipython:: python + + s.astype(object).str.count("a") + s.astype(object).dropna().str.count("a") + + When NA values are present, the output dtype is float64. + +2. 
Some string methods, like :meth:`Series.str.decode` are not available + on ``StringArray`` because ``StringArray`` only holds strings, not + bytes. + + +Everything else that follows in the rest of this document applies equally to ``string`` and ``object`` dtype. .. _text.string_methods: diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index c7278d5a47ba6..e7dc6150ffcb1 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -33,7 +33,7 @@ Check the :ref:`API Changes ` and :ref:`deprecations .. note:: - This is a combined release for 0.20.0 and and 0.20.1. + This is a combined release for 0.20.0 and 0.20.1. Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) .. contents:: What's new in v0.20.0 diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 42579becd4237..85de0150a5a28 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -353,7 +353,7 @@ Example: mi = pd.MultiIndex.from_product([list('AB'), list('CD'), list('EF')], names=['AB', 'CD', 'EF']) - df = pd.DataFrame([i for i in range(len(mi))], index=mi, columns=['N']) + df = pd.DataFrame(list(range(len(mi))), index=mi, columns=['N']) df df.rename_axis(index={'CD': 'New'}) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cd012fe755337..7d11d90eeb670 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -63,7 +63,7 @@ Previously, strings were typically stored in object-dtype NumPy arrays. ``StringDtype`` is currently considered experimental. The implementation and parts of the API may change without warning. -The text extension type solves several issues with object-dtype NumPy arrays: +The ``'string'`` extension type solves several issues with object-dtype NumPy arrays: 1. You can accidentally store a *mixture* of strings and non-strings in an ``object`` dtype array. A ``StringArray`` can only store strings. @@ -88,12 +88,44 @@ You can use the alias ``"string"`` as well. The usual string accessor methods work. Where appropriate, the return type of the Series or columns of a DataFrame will also have string dtype. +.. ipython:: python + s.str.upper() s.str.split('b', expand=True).dtypes +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. +.. _whatsnew_100.boolean: + +Boolean data type with missing values support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`BooleanDtype` / :class:`~arrays.BooleanArray`, an extension +type dedicated to boolean data that can hold missing values. With the default +``'bool`` data type based on a numpy bool array, the column can only hold +True or False values and not missing values. This new :class:`BooleanDtype` +can store missing values as well by keeping track of this in a separate mask. +(:issue:`29555`) + +.. ipython:: python + + pd.Series([True, False, None], dtype=pd.BooleanDtype()) + +You can use the alias ``"boolean"`` as well. + +.. ipython:: python + + s = pd.Series([True, False, None], dtype="boolean") + s + + .. 
_whatsnew_1000.enhancements.other: Other enhancements @@ -114,6 +146,10 @@ Other enhancements - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) +- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) +- Roundtripping DataFrames with nullable integer or string data types to parquet + (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine + now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). Build Changes ^^^^^^^^^^^^^ @@ -130,34 +166,39 @@ Backwards incompatible API changes .. _whatsnew_1000.api_breaking.MultiIndex._names: -``MultiIndex.levels`` do not hold level names any longer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Avoid using names from ``MultiIndex.levels`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- A :class:`MultiIndex` previously stored the level names as attributes of each of its - :attr:`MultiIndex.levels`. From Pandas 1.0, the names are only accessed through - :attr:`MultiIndex.names` (which was also possible previously). This is done in order to - make :attr:`MultiIndex.levels` more similar to :attr:`CategoricalIndex.categories` (:issue:`27242`:). +As part of a larger refactor to :class:`MultiIndex` the level names are now +stored separately from the levels (:issue:`27242`). We recommend using +:attr:`MultiIndex.names` to access the names, and :meth:`Index.set_names` +to update the names. -*pandas 0.25.x* +For backwards compatibility, you can still *access* the names via the levels. -.. code-block:: ipython +.. ipython:: python - In [1]: mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) - Out[2]: mi - MultiIndex([(1, 'a'), - (1, 'b'), - (2, 'a'), - (2, 'b')], - names=['x', 'y']) - Out[3]: mi.levels[0].name - 'x' + mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi.levels[0].name -*pandas 1.0.0* +However, it is no longer possible to *update* the names of the ``MultiIndex`` +via the name of the level. The following will **silently** fail to update the +name of the ``MultiIndex`` .. ipython:: python - mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) - mi.levels[0].name + mi.levels[0].name = "new name" + mi.names + +To update, use ``MultiIndex.set_names``, which returns a new ``MultiIndex``. + +.. ipython:: python + + mi2 = mi.set_names("new name", level=0) + mi2.names + +New repr for :class:`pandas.core.arrays.IntervalArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) @@ -178,11 +219,118 @@ Backwards incompatible API changes pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) + +All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + +- :meth:`SeriesGroupBy.count` +- :meth:`SeriesGroupBy.size` +- :meth:`SeriesGroupBy.nunique` +- :meth:`SeriesGroupBy.nth` + +.. 
ipython:: python + + df = pd.DataFrame({ + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + }) + df + + +*pandas 0.25.x* + +.. code-block:: ipython + + In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + Out[2]: + cat_1 cat_2 + A A 1 + B 1 + B A 1 + B 1 + Name: value, dtype: int64 + + +*pandas 1.0.0* + +.. ipython:: python + + df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + + +.. _whatsnew_1000.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some minimum supported versions of dependencies were updated (:issue:`29723`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.13.3 | X | ++-----------------+-----------------+----------+ +| pytz | 2015.4 | X | ++-----------------+-----------------+----------+ +| python-dateutil | 2.6.1 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.1 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.2 | | ++-----------------+-----------------+----------+ +| pytest (dev) | 4.0.2 | | ++-----------------+-----------------+----------+ + +For `optional libraries `_ the general recommendation is to use the latest version. +The following table lists the lowest version per library that is currently being tested throughout the development of pandas. +Optional libraries below the lowest tested version may still work, but are not considered supported. + ++-----------------+-----------------+ +| Package | Minimum Version | ++=================+=================+ +| beautifulsoup4 | 4.6.0 | ++-----------------+-----------------+ +| fastparquet | 0.3.2 | ++-----------------+-----------------+ +| gcsfs | 0.2.2 | ++-----------------+-----------------+ +| lxml | 3.8.0 | ++-----------------+-----------------+ +| matplotlib | 2.2.2 | ++-----------------+-----------------+ +| openpyxl | 2.4.8 | ++-----------------+-----------------+ +| pyarrow | 0.9.0 | ++-----------------+-----------------+ +| pymysql | 0.7.1 | ++-----------------+-----------------+ +| pytables | 3.4.2 | ++-----------------+-----------------+ +| scipy | 0.19.0 | ++-----------------+-----------------+ +| sqlalchemy | 1.1.4 | ++-----------------+-----------------+ +| xarray | 0.8.2 | ++-----------------+-----------------+ +| xlrd | 1.1.0 | ++-----------------+-----------------+ +| xlsxwriter | 0.9.8 | ++-----------------+-----------------+ +| xlwt | 1.2.0 | ++-----------------+-----------------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + + .. _whatsnew_1000.api.other: Other API changes ^^^^^^^^^^^^^^^^^ +- Bumpded the minimum supported version of ``s3fs`` from 0.0.8 to 0.3.0 (:issue:`28616`) - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) @@ -195,6 +343,7 @@ Other API changes See :ref:`units registration ` for more. - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. 
Supplying anything other than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`)
+- When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`)
- @@ -252,16 +401,44 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. **Other removals**
+- Removed the previously deprecated :meth:`Index.summary` (:issue:`18217`)
- Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`)
- Changed the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`)
+- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`)
+- Removed :meth:`Series.from_array` (:issue:`18258`)
+- Removed :meth:`DataFrame.from_items` (:issue:`18458`)
+- Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`)
+- Removed :meth:`Series.asobject` (:issue:`18477`)
+- Removed :meth:`DataFrame.as_blocks`, :meth:`Series.as_blocks`, `DataFrame.blocks`, :meth:`Series.blocks` (:issue:`17656`)
- :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`)
- :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`)
+- :func:`core.internals.blocks.make_block` no longer accepts the "fastpath" keyword (:issue:`19265`)
+- :meth:`Block.make_block_same_class` no longer accepts the "dtype" keyword (:issue:`19434`)
- Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`)
+- Removed support for legacy HDF5 formats (:issue:`29787`)
+- :func:`read_excel` removed support for the "skip_footer" argument, use "skipfooter" instead (:issue:`18836`)
+- :func:`read_excel` no longer allows an integer value for the parameter ``usecols``, instead pass a list of integers from 0 to ``usecols`` inclusive (:issue:`23635`)
+- :meth:`DataFrame.to_records` no longer supports the argument "convert_datetime64" (:issue:`18902`)
- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`)
+- Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`)
+- Removed the previously deprecated :func:`api.types.is_period` and :func:`api.types.is_datetimetz` (:issue:`23917`)
- Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`)
+- Removed previously deprecated :func:`pandas.tseries.plotting.tsplot` (:issue:`18627`)
- Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`)
- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`)
+- Removed previously deprecated "nthreads" argument from :func:`read_feather`, use "use_threads" instead (:issue:`23053`)
- Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`)
+- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`)
+- Removed the previously deprecated :meth:`Series.valid`; use :meth:`Series.dropna` instead (:issue:`18800`)
+- Removed the previously deprecated properties :attr:`DataFrame.is_copy`, :attr:`Series.is_copy` (:issue:`18812`)
+- Removed the previously deprecated :meth:`DataFrame.get_ftype_counts`, :meth:`Series.get_ftype_counts` (:issue:`18243`)
+- Removed the previously deprecated :meth:`Index.get_duplicated`, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`)
+- Removed the previously deprecated :meth:`Series.clip_upper`, :meth:`Series.clip_lower`, :meth:`DataFrame.clip_upper`, :meth:`DataFrame.clip_lower` (:issue:`24203`)
+- Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`)
+- Removed previously deprecated "v" argument from :meth:`FrozenNDarray.searchsorted`, use "value" instead (:issue:`22672`)
+- :func:`read_stata` and :meth:`DataFrame.to_stata` no longer support the "encoding" argument (:issue:`21400`)
+- Removed previously deprecated "raise_conflict" argument from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`)
+- Removed previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, :meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`)
-
.. _whatsnew_1000.performance: @@ -276,6 +453,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
- Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`)
- Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`)
+- Performance improvement when comparing a :meth:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`)
..
_whatsnew_1000.bug_fixes: @@ -297,6 +475,7 @@ Categorical - Using date accessors on a categorical dtyped :class:`Series` of datetimes was not returning an object of the same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue: `27952`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) Datetimelike @@ -314,11 +493,11 @@ Datetimelike - Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) -- +- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) Timedelta ^^^^^^^^^ - +- Bug in subtracting a :class:`TimedeltaIndex` or :class:`TimedeltaArray` from a ``np.datetime64`` object (:issue:`29558`) - - @@ -335,7 +514,10 @@ Numeric - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`) - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) +- Bug in :meth:`Series.var` not computing the right value with a nullable integer dtype series not passing through ddof argument (:issue:`29128`) - Improved error message when using `frac` > 1 and `replace` = False (:issue:`27451`) +- Bug in numeric indexes resulted in it being possible to instantiate an :class:`Int64Index`, :class:`UInt64Index`, or :class:`Float64Index` with an invalid dtype (e.g. datetime-like) (:issue:`29539`) +- Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) - Conversion @@ -347,7 +529,7 @@ Conversion Strings ^^^^^^^ -- +- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty Series would return an object dtype instead of bool (:issue:`29624`) - @@ -400,6 +582,9 @@ I/O - Bug in :meth:`DataFrame.to_html` when using ``formatters=`` and ``max_cols`` together. 
(:issue:`25955`) - Bug in :meth:`Styler.background_gradient` not able to work with dtype ``Int64`` (:issue:`28869`) - Bug in :meth:`DataFrame.to_clipboard` which did not work reliably in ipython (:issue:`22707`) +- Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) +- Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) +- Plotting ^^^^^^^^ @@ -421,23 +606,33 @@ Groupby/resample/rolling - - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) +- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty series or dataframe (:issue:`28427`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue: `15584`). - Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue: `19248`). - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) +- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) +- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) +- Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.apply` that caused incorrect output with empty :class:`DataFrame` (:issue:`28202`, :issue:`21959`) - Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`) +- Bug in :meth:`pivot_table` not returning correct type ``float`` when ``margins=True`` and ``aggfunc='mean'`` (:issue:`24893`) - Bug :func:`merge_asof` could not use :class:`datetime.timedelta` for ``tolerance`` kwarg (:issue:`28098`) - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). 
- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) +- Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) +- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) +- Bug in :meth:`DataFrame.replace` that caused non-numeric replacer's dtype not respected (:issue:`26632`) +- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) + Sparse ^^^^^^ @@ -461,6 +656,7 @@ Other - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) - :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue: 28479) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) +- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) diff --git a/environment.yml b/environment.yml index e9ac76f5bc52c..848825c37a160 100644 --- a/environment.yml +++ b/environment.yml @@ -15,13 +15,13 @@ dependencies: - cython>=0.29.13 # code checks - - black<=19.3b0 + - black=19.10b0 - cpplint - flake8 - - flake8-comprehensions # used by flake8, linting of unnecessary comprehensions + - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - isort # check that imports are in the right order - - mypy=0.720 + - mypy=0.730 - pycodestyle # used by flake8 # documentation @@ -51,9 +51,9 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=4.0.2 + - pytest>=5.0.1 - pytest-cov - - pytest-xdist + - pytest-xdist>=1.21 - seaborn - statsmodels @@ -75,7 +75,7 @@ dependencies: # optional for io - beautifulsoup4>=4.6.0 # pandas.read_html - - fastparquet>=0.2.1 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - html5lib # pandas.read_html - lxml # pandas.read_html - openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile diff --git a/pandas/__init__.py b/pandas/__init__.py index 5d163e411c0ac..cd697b757a26a 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -67,6 +67,7 @@ IntervalDtype, DatetimeTZDtype, StringDtype, + BooleanDtype, # missing isna, isnull, diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 890db5b41907e..814f855cceeac 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -58,16 +58,16 @@ RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") # holds deprecated option metdata -_deprecated_options = {} # type: Dict[str, DeprecatedOption] +_deprecated_options: Dict[str, DeprecatedOption] = {} # holds registered option metdata -_registered_options = {} # type: Dict[str, RegisteredOption] +_registered_options: Dict[str, RegisteredOption] = {} # holds the current values for registered options -_global_config = {} # type: Dict[str, str] +_global_config: Dict[str, str] = {} # keys which have a special meaning -_reserved_keys = ["all"] # type: List[str] +_reserved_keys: List[str] = ["all"] class 
OptionError(AttributeError, KeyError): diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index d3b5ecfdaa178..1906193622953 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -75,7 +75,7 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): lens[i] = l cdata = data - # keep the references alive thru the end of the + # keep the references alive through the end of the # function datas.append(data) vecs[i] = cdata diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c39d6d60d4ea5..b207fcb66948d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -100,7 +100,7 @@ cdef class {{name}}Vector: PyMem_Free(self.data) self.data = NULL - def __len__(self): + def __len__(self) -> int: return self.data.n cpdef to_array(self): @@ -168,7 +168,7 @@ cdef class StringVector: PyMem_Free(self.data) self.data = NULL - def __len__(self): + def __len__(self) -> int: return self.data.n def to_array(self): @@ -212,7 +212,7 @@ cdef class ObjectVector: self.ao = np.empty(_INIT_VEC_CAP, dtype=object) self.data = self.ao.data - def __len__(self): + def __len__(self) -> int: return self.n cdef inline append(self, object obj): @@ -270,7 +270,7 @@ cdef class {{name}}HashTable(HashTable): size_hint = min(size_hint, _SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) - def __len__(self): + def __len__(self) -> int: return self.table.size def __dealloc__(self): @@ -897,7 +897,7 @@ cdef class PyObjectHashTable(HashTable): kh_destroy_pymap(self.table) self.table = NULL - def __len__(self): + def __len__(self) -> int: return self.table.size def __contains__(self, object key): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index c4284ae403e5c..f8f3858b803a5 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -144,13 +144,13 @@ def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} for i in range(n - 1, -1, -1): - # equivalent: range(n)[::-1], which cython doesnt like in nogil + # equivalent: range(n)[::-1], which cython doesn't like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: for i in range(n - 1, -1, -1): - # equivalent: range(n)[::-1], which cython doesnt like in nogil + # equivalent: range(n)[::-1], which cython doesn't like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{endif}} diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 308e914b7b5b7..7a25a52c7e608 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -11,7 +11,7 @@ cdef class _NDFrameIndexerBase: self._ndim = None @property - def ndim(self): + def ndim(self) -> int: # Delay `ndim` instantiation until required as reading it # from `obj` isn't entirely cheap. 
ndim = self._ndim diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index ecd090de500da..ba108c4524b9c 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t -from cpython.object cimport PyObject +from cpython.slice cimport PySlice_GetIndicesEx cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX @@ -9,13 +9,6 @@ cdef extern from "Python.h": import numpy as np from numpy cimport int64_t -cdef extern from "compat_helper.h": - cdef int slice_get_indices(PyObject* s, Py_ssize_t length, - Py_ssize_t *start, Py_ssize_t *stop, - Py_ssize_t *step, - Py_ssize_t *slicelength) except -1 - - from pandas._libs.algos import ensure_int64 @@ -66,7 +59,7 @@ cdef class BlockPlacement: def __repr__(self) -> str: return str(self) - def __len__(self): + def __len__(self) -> int: cdef: slice s = self._ensure_has_slice() if s is not None: @@ -85,7 +78,7 @@ cdef class BlockPlacement: return iter(self._as_array) @property - def as_slice(self): + def as_slice(self) -> slice: cdef: slice s = self._ensure_has_slice() if s is None: @@ -118,7 +111,7 @@ cdef class BlockPlacement: return self._as_array @property - def is_slice_like(self): + def is_slice_like(self) -> bool: cdef: slice s = self._ensure_has_slice() return s is not None @@ -258,8 +251,8 @@ cpdef Py_ssize_t slice_len( if slc is None: raise TypeError("slc must be slice") - slice_get_indices(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) return length @@ -278,8 +271,8 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - slice_get_indices(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) return start, stop, step, length @@ -441,7 +434,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): yield blkno, result -def get_blkno_placements(blknos, group=True): +def get_blkno_placements(blknos, group: bool = True): """ Parameters diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7d65cb52bce1e..aaf6456df8f8e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -971,6 +971,7 @@ cdef class Seen: bint nat_ # seen nat bint bool_ # seen_bool bint null_ # seen_null + bint nan_ # seen_np.nan bint uint_ # seen_uint (unsigned integer) bint sint_ # seen_sint (signed integer) bint float_ # seen_float @@ -995,6 +996,7 @@ cdef class Seen: self.nat_ = 0 self.bool_ = 0 self.null_ = 0 + self.nan_ = 0 self.uint_ = 0 self.sint_ = 0 self.float_ = 0 @@ -1953,10 +1955,37 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint safe=0, bint convert_datetime=0, - bint convert_timedelta=0): + bint convert_timedelta=0, + bint convert_to_nullable_integer=0): """ Type inference function-- convert object array to proper dtype + + Parameters + ---------- + values : ndarray + Array of object elements to convert. + try_float : bool, default False + If an array-like object contains only float or NaN values is + encountered, whether to convert and return an array of float dtype. + safe : bool, default False + Whether to upcast numeric type (e.g. int cast to float). If set to + True, no upcasting will be performed. 
+ convert_datetime : bool, default False + If an array-like object contains only datetime values or NaT is + encountered, whether to convert and return an array of M8[ns] dtype. + convert_timedelta : bool, default False + If an array-like object contains only timedelta values or NaT is + encountered, whether to convert and return an array of m8[ns] dtype. + convert_to_nullable_integer : bool, default False + If an array-like object contains only interger values (and NaN) is + encountered, whether to convert and return an IntegerArray. + + Returns + ------- + array : array of converted object values to more specific dtypes if + pplicable """ + cdef: Py_ssize_t i, n ndarray[float64_t] floats @@ -1977,6 +2006,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, ints = np.empty(n, dtype='i8') uints = np.empty(n, dtype='u8') bools = np.empty(n, dtype=np.uint8) + mask = np.full(n, False) if convert_datetime: datetimes = np.empty(n, dtype='M8[ns]') @@ -1994,6 +2024,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen.null_ = 1 floats[i] = complexes[i] = fnan + mask[i] = True elif val is NaT: seen.nat_ = 1 if convert_datetime: @@ -2003,6 +2034,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not (convert_datetime or convert_timedelta): seen.object_ = 1 break + elif val is np.nan: + seen.nan_ = 1 + mask[i] = True + floats[i] = complexes[i] = val elif util.is_bool_object(val): seen.bool_ = 1 bools[i] = val @@ -2084,11 +2119,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen.object_: if not safe: - if seen.null_: + if seen.null_ or seen.nan_: if seen.is_float_or_complex: if seen.complex_: return complexes - elif seen.float_ or seen.int_: + elif seen.float_: + return floats + elif seen.int_: + if convert_to_nullable_integer: + from pandas.core.arrays import IntegerArray + return IntegerArray(ints, mask) + else: + return floats + elif seen.nan_: return floats else: if not seen.bool_: @@ -2127,7 +2170,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if seen.complex_: if not seen.int_: return complexes - elif seen.float_: + elif seen.float_ or seen.nan_: if not seen.int_: return floats else: @@ -2151,7 +2194,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if seen.complex_: if not seen.int_: return complexes - elif seen.float_: + elif seen.float_ or seen.nan_: if not seen.int_: return floats elif seen.int_: @@ -2165,9 +2208,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects +_no_default = object() + + @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1): +def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, + object na_value=_no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -2175,6 +2222,15 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1) ---------- arr : ndarray f : function + mask : ndarray + uint8 dtype ndarray indicating values not to apply `f` to. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + na_value : Any, optional + The result value to use for masked values. By default, the + input value is used + dtype : numpy.dtype + The numpy dtype to use for the result ndarray. 
Returns ------- @@ -2182,14 +2238,17 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1) """ cdef: Py_ssize_t i, n - ndarray[object] result + ndarray result object val n = len(arr) - result = np.empty(n, dtype=object) + result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: - val = arr[i] + if na_value is _no_default: + val = arr[i] + else: + val = na_value else: val = f(arr[i]) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 601b81556be0e..8f0f4e17df2f9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -589,8 +589,7 @@ cdef class TextReader: if not isinstance(quote_char, (str, bytes)) and quote_char is not None: dtype = type(quote_char).__name__ - raise TypeError('"quotechar" must be string, ' - 'not {dtype}'.format(dtype=dtype)) + raise TypeError(f'"quotechar" must be string, not {dtype}') if quote_char is None or quote_char == '': if quoting != QUOTE_NONE: @@ -685,7 +684,7 @@ cdef class TextReader: if not os.path.exists(source): raise FileNotFoundError( ENOENT, - 'File {source} does not exist'.format(source=source), + f'File {source} does not exist', source) raise IOError('Initializing from file failed') @@ -741,8 +740,8 @@ cdef class TextReader: self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): - msg = "[%s], len of %d," % ( - ','.join(str(m) for m in msg), len(msg)) + joined = ','.join(str(m) for m in msg) + msg = f"[{joined}], len of {len(msg)}," raise ParserError( f'Passed header={msg} but only ' f'{self.parser.lines} lines in file') @@ -768,10 +767,9 @@ cdef class TextReader: if name == '': if self.has_mi_columns: - name = ('Unnamed: {i}_level_{lvl}' - .format(i=i, lvl=level)) + name = f'Unnamed: {i}_level_{level}' else: - name = 'Unnamed: {i}'.format(i=i) + name = f'Unnamed: {i}' unnamed_count += 1 count = counts.get(name, 0) @@ -845,11 +843,6 @@ cdef class TextReader: passed_count = len(header[0]) - # if passed_count > field_count: - # raise ParserError('Column names have %d fields, ' - # 'data has %d fields' - # % (passed_count, field_count)) - if (self.has_usecols and self.allow_leading_cols and not callable(self.usecols)): nuse = len(self.usecols) @@ -990,7 +983,7 @@ cdef class TextReader: cdef _end_clock(self, what): if self.verbose: elapsed = time.time() - self.clocks.pop(-1) - print('%s took: %.2f ms' % (what, elapsed * 1000)) + print(f'{what} took: {elapsed * 1000:.2f} ms') def set_noconvert(self, i): self.noconvert.add(i) @@ -1028,11 +1021,9 @@ cdef class TextReader: (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise ParserError( - "Too many columns specified: expected {expected} and " - "found {found}" - .format(expected=self.table_width - self.leading_cols, - found=num_cols)) + raise ParserError(f"Too many columns specified: expected " + f"{self.table_width - self.leading_cols} " + f"and found {num_cols}") results = {} nused = 0 @@ -1075,9 +1066,9 @@ cdef class TextReader: if conv: if col_dtype is not None: - warnings.warn(("Both a converter and dtype were specified " - "for column {0} - only the converter will " - "be used").format(name), ParserWarning, + warnings.warn((f"Both a converter and dtype were specified " + f"for column {name} - only the converter will " + f"be used"), ParserWarning, stacklevel=5) results[i] = _apply_converter(conv, self.parser, i, start, end, self.c_encoding) @@ -1118,7 +1109,7 @@ cdef class TextReader: col_res = _maybe_upcast(col_res) if col_res is None: - raise 
ParserError('Unable to parse column {i}'.format(i=i)) + raise ParserError(f'Unable to parse column {i}') results[i] = col_res @@ -1178,12 +1169,9 @@ cdef class TextReader: col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): raise ValueError( - "cannot safely convert passed user dtype of " - "{col_dtype} for {col_res} dtyped data in " - "column {column}".format( - col_dtype=col_dtype, - col_res=col_res_orig.dtype.name, - column=i)) + f"cannot safely convert passed user dtype of " + f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in " + f"column {i}") return col_res, na_count @@ -1216,9 +1204,9 @@ cdef class TextReader: dtype=dtype) except NotImplementedError: raise NotImplementedError( - "Extension Array: {ea} must implement " - "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type)) + f"Extension Array: {array_type} must implement " + f"_from_sequence_of_strings in order " + f"to be used in parser methods") return result, na_count @@ -1228,8 +1216,7 @@ cdef class TextReader: end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: - raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + raise ValueError(f"Integer column has NA values in column {i}") except OverflowError: result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) @@ -1253,8 +1240,7 @@ cdef class TextReader: self.true_set, self.false_set) if user_dtype and na_count is not None: if na_count > 0: - raise ValueError("Bool column has NA values in " - "column {column}".format(column=i)) + raise ValueError(f"Bool column has NA values in column {i}") return result, na_count elif dtype.kind == 'S': @@ -1270,8 +1256,7 @@ cdef class TextReader: elif dtype.kind == 'U': width = dtype.itemsize if width > 0: - raise TypeError("the dtype {dtype} is not " - "supported for parsing".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported for parsing") # unicode variable width return self._string_convert(i, start, end, na_filter, @@ -1280,12 +1265,11 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_datetime64_dtype(dtype): - raise TypeError("the dtype {dtype} is not supported " - "for parsing, pass this column " - "using parse_dates instead".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported " + f"for parsing, pass this column " + f"using parse_dates instead") else: - raise TypeError("the dtype {dtype} is not " - "supported for parsing".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported for parsing") cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_starts_t *na_hashset): @@ -1422,59 +1406,6 @@ cdef inline StringPath _string_path(char *encoding): # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, int64_t col, - int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): - cdef: - int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL - ndarray[object] result - - int ret = 0 - kh_strbox_t *table - - object pyval - - object NA = na_values[np.object_] - khiter_t k - - table = kh_init_strbox() - lines = line_end - line_start - result = np.empty(lines, dtype=np.object_) - coliter_setup(&it, parser, col, line_start) - - for i in range(lines): - COLITER_NEXT(it, word) - - if na_filter: - if kh_get_str_starts_item(na_hashset, word): - # in the hash table 
- na_count += 1 - result[i] = NA - continue - - k = kh_get_strbox(table, word) - - # in the hash table - if k != table.n_buckets: - # this increments the refcount, but need to test - pyval = table.vals[k] - else: - # box it. new ref? - pyval = PyBytes_FromString(word) - - k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval - - result[i] = pyval - - kh_destroy_strbox(table) - - return result, na_count - - cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset): @@ -2132,7 +2063,7 @@ cdef raise_parser_error(object base, parser_t *parser): Py_XDECREF(type) raise old_exc - message = '{base}. C error: '.format(base=base) + message = f'{base}. C error: ' if parser.error_msg != NULL: message += parser.error_msg.decode('utf-8') else: diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index e6e658c0c6979..ea54b00cf5be4 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -18,15 +18,13 @@ cimport pandas._libs.util as util from pandas._libs.lib import maybe_convert_objects -cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt): +cdef _check_result_array(object obj, Py_ssize_t cnt): if (util.is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Function does not reduce') - return np.empty(size, dtype='O') - cdef bint _is_sparse_array(object obj): # TODO can be removed one SparseArray.values is removed (GH26421) @@ -83,12 +81,10 @@ cdef class Reducer: else: - # we passed a series-like - if hasattr(dummy, 'values'): - - typ = type(dummy) - index = getattr(dummy, 'index', None) - dummy = dummy.values + # we passed a Series + typ = type(dummy) + index = dummy.index + dummy = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') @@ -101,10 +97,9 @@ cdef class Reducer: cdef: char* dummy_buf ndarray arr, result, chunk - Py_ssize_t i, incr + Py_ssize_t i flatiter it - bint has_labels - object res, name, labels, index + object res, name, labels object cached_typ = None arr = self.arr @@ -112,40 +107,26 @@ cdef class Reducer: dummy_buf = chunk.data chunk.data = arr.data labels = self.labels - has_labels = labels is not None - has_index = self.index is not None - incr = self.increment + + result = np.empty(self.nresults, dtype='O') + it = PyArray_IterNew(result) try: for i in range(self.nresults): - if has_labels: - name = labels[i] - else: - name = None - # create the cached type # each time just reassign the data if i == 0: if self.typ is not None: - - # recreate with the index if supplied - if has_index: - - cached_typ = self.typ( - chunk, index=self.index, name=name) - - else: - - # use the passsed typ, sans index - cached_typ = self.typ(chunk, name=name) + # In this case, we also have self.index + name = labels[i] + cached_typ = self.typ(chunk, index=self.index, name=name) # use the cached_typ if possible if cached_typ is not None: - - if has_index: - object.__setattr__(cached_typ, 'index', self.index) + # In this case, we also have non-None labels + name = labels[i] object.__setattr__( cached_typ._data._block, 'values', chunk) @@ -154,14 +135,12 @@ cdef class Reducer: else: res = self.f(chunk) - if (not _is_sparse_array(res) and hasattr(res, 'values') - and util.is_array(res.values)): - res = res.values + # TODO: reason for not squeezing here? 
+ res = _extract_result(res, squeeze=False) if i == 0: - result = _get_result_array(res, - self.nresults, - len(self.dummy)) - it = PyArray_IterNew(result) + # On the first pass, we check the output shape to see + # if this looks like a reduction. + _check_result_array(res, len(self.dummy)) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) chunk.data = chunk.data + self.increment @@ -170,9 +149,7 @@ cdef class Reducer: # so we don't free the wrong memory chunk.data = dummy_buf - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result @@ -194,6 +171,44 @@ cdef class _BaseGrouper: return values, index + cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, + Slider islider, Slider vslider): + if cached_typ is None: + cached_ityp = self.ityp(islider.buf) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) + else: + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference + # to a 1-d ndarray like datetime / timedelta / period. + object.__setattr__(cached_ityp, '_index_data', islider.buf) + cached_ityp._engine.clear_mapping() + object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ, '_index', cached_ityp) + object.__setattr__(cached_typ, 'name', self.name) + + return cached_typ, cached_ityp + + cdef inline object _apply_to_group(self, + object cached_typ, object cached_ityp, + Slider islider, Slider vslider, + Py_ssize_t group_size, bint initialized): + """ + Call self.f on our new group, then update to the next group. + """ + cached_ityp._engine.clear_mapping() + res = self.f(cached_typ) + res = _extract_result(res) + if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. + initialized = 1 + _check_result_array(res, len(self.dummy_arr)) + + islider.advance(group_size) + vslider.advance(group_size) + + return res, initialized + cdef class SeriesBinGrouper(_BaseGrouper): """ @@ -209,6 +224,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): def __init__(self, object series, object f, object bins, object dummy): assert dummy is not None # always obj[:0] + assert len(bins) > 0 # otherwise we get IndexError in get_result self.bins = bins self.f = f @@ -221,7 +237,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series, 'name', None) + self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) @@ -239,7 +255,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): object res bint initialized = 0 Slider vslider, islider - object name, cached_typ = None, cached_ityp = None + object cached_typ = None, cached_ityp = None counts = np.zeros(self.ngroups, dtype=np.int64) @@ -253,11 +269,12 @@ cdef class SeriesBinGrouper(_BaseGrouper): group_size = 0 n = len(self.arr) - name = self.name vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(self.ngroups): group_size = counts[i] @@ -265,42 +282,21 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.set_length(group_size) vslider.set_length(group_size) - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, - name=name) - else: - # See the comment in indexes/base.py about _index_data. 
- # We need this for EA-backed indexes that have a reference - # to a 1-d ndarray like datetime / timedelta / period. - object.__setattr__(cached_ityp, '_index_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__( - cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) + cached_typ, cached_ityp = self._update_cached_objs( + cached_typ, cached_ityp, islider, vslider) - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) - result[i] = res + res, initialized = self._apply_to_group(cached_typ, cached_ityp, + islider, vslider, + group_size, initialized) - islider.advance(group_size) - vslider.advance(group_size) + result[i] = res finally: # so we don't free the wrong memory islider.reset() vslider.reset() - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result, counts @@ -323,6 +319,10 @@ cdef class SeriesGrouper(_BaseGrouper): # safer obj._get_values(slice(None, 0)) assert dummy is not None + if len(series) == 0: + # get_result would never assign `result` + raise ValueError("SeriesGrouper requires non-empty `series`") + self.labels = labels self.f = f @@ -334,7 +334,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series, 'name', None) + self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.ngroups = ngroups @@ -348,17 +348,18 @@ cdef class SeriesGrouper(_BaseGrouper): object res bint initialized = 0 Slider vslider, islider - object name, cached_typ = None, cached_ityp = None + object cached_typ = None, cached_ityp = None labels = self.labels counts = np.zeros(self.ngroups, dtype=np.int64) group_size = 0 n = len(self.arr) - name = self.name vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(n): group_size += 1 @@ -375,32 +376,15 @@ cdef class SeriesGrouper(_BaseGrouper): islider.set_length(group_size) vslider.set_length(group_size) - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, - name=name) - else: - object.__setattr__(cached_ityp, '_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__( - cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) - - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) + cached_typ, cached_ityp = self._update_cached_objs( + cached_typ, cached_ityp, islider, vslider) + + res, initialized = self._apply_to_group(cached_typ, cached_ityp, + islider, vslider, + group_size, initialized) result[lab] = res counts[lab] = group_size - islider.advance(group_size) - vslider.advance(group_size) - group_size = 0 finally: @@ -408,27 +392,26 @@ cdef class SeriesGrouper(_BaseGrouper): islider.reset() vslider.reset() - if result is None: - raise ValueError("No result.") + # We check for empty series in the constructor, so should always + # 
have result initialized by this point. + assert initialized, "`result` has not been initialized." - if result.dtype == np.object_: - result = maybe_convert_objects(result) + result = maybe_convert_objects(result) return result, counts -cdef inline _extract_result(object res): +cdef inline _extract_result(object res, bint squeeze=True): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ if (not _is_sparse_array(res) and hasattr(res, 'values') and util.is_array(res.values)): res = res.values - if not np.isscalar(res): - if util.is_array(res): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: - res = res[0] + if util.is_array(res): + if res.ndim == 0: + res = res.item() + elif squeeze and res.ndim == 1 and len(res) == 1: + res = res[0] return res @@ -492,8 +475,8 @@ def apply_frame_axis0(object frame, object f, object names, object piece dict item_cache - if frame.index._has_complex_internals: - raise InvalidApply('Cannot modify frame index internals') + # We have already checked that we don't have a MultiIndex before calling + assert frame.index.nlevels == 1 results = [] @@ -610,21 +593,25 @@ cdef class BlockSlider: arr.shape[1] = 0 -def compute_reduction(arr, f, axis=0, dummy=None, labels=None): +def compute_reduction(arr: np.ndarray, f, axis: int = 0, dummy=None, labels=None): """ Parameters ----------- - arr : NDFrame object + arr : np.ndarray f : function axis : integer axis dummy : type of reduced output (series) labels : Index or None """ + # We either have both dummy and labels, or neither of them + if (labels is None) ^ (dummy is None): + raise ValueError("Must pass either dummy and labels, or neither") + if labels is not None: # Caller is responsible for ensuring we don't have MultiIndex - assert not labels._has_complex_internals + assert labels.nlevels == 1 # pass as an ndarray/ExtensionArray labels = labels._values diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 578995a3eb3b6..ee83901040b36 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -57,7 +57,7 @@ cdef class IntIndex(SparseIndex): return output @property - def nbytes(self): + def nbytes(self) -> int: return self.indices.nbytes def check_integrity(self): @@ -91,7 +91,7 @@ cdef class IntIndex(SparseIndex): if not monotonic: raise ValueError("Indices must be strictly increasing") - def equals(self, other): + def equals(self, other) -> bool: if not isinstance(other, IntIndex): return False @@ -103,7 +103,7 @@ cdef class IntIndex(SparseIndex): return same_length and same_indices @property - def ngaps(self): + def ngaps(self) -> int: return self.length - self.npoints def to_int_index(self): @@ -348,11 +348,11 @@ cdef class BlockIndex(SparseIndex): return output @property - def nbytes(self): + def nbytes(self) -> int: return self.blocs.nbytes + self.blengths.nbytes @property - def ngaps(self): + def ngaps(self) -> int: return self.length - self.npoints cpdef check_integrity(self): @@ -388,7 +388,7 @@ cdef class BlockIndex(SparseIndex): if blengths[i] == 0: raise ValueError(f'Zero-length block {i}') - def equals(self, other): + def equals(self, other) -> bool: if not isinstance(other, BlockIndex): return False diff --git a/pandas/_libs/src/compat_helper.h b/pandas/_libs/src/compat_helper.h deleted file mode 100644 index 078069fb48af2..0000000000000 --- a/pandas/_libs/src/compat_helper.h +++ /dev/null @@ -1,50 +0,0 @@ -/* -Copyright (c) 2016, PyData Development Team -All rights reserved. 
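As a side note on the argument validation added to compute_reduction: the XOR of the two None-checks is truthy exactly when one of dummy/labels is supplied without the other. A minimal standalone illustration (the function name here is made up):

def check_pair(labels=None, dummy=None):
    # True ^ False or False ^ True -> exactly one argument was supplied
    if (labels is None) ^ (dummy is None):
        raise ValueError("Must pass either dummy and labels, or neither")

check_pair()                        # ok: neither supplied
check_pair(labels=[0], dummy=[])    # ok: both supplied
# check_pair(labels=[0])            # raises ValueError: only one supplied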
- -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. -*/ - -#ifndef PANDAS__LIBS_SRC_COMPAT_HELPER_H_ -#define PANDAS__LIBS_SRC_COMPAT_HELPER_H_ - -#include "Python.h" -#include "inline_helper.h" - -/* -PySlice_GetIndicesEx changes signature in PY3 -but 3.6.1 in particular changes the behavior of this function slightly -https://bugs.python.org/issue27867 - - -In 3.6.1 PySlice_GetIndicesEx was changed to a macro -inadvertently breaking ABI compat. For now, undefing -the macro, which restores compat. -https://github.com/pandas-dev/pandas/issues/15961 -https://bugs.python.org/issue29943 -*/ - -#ifndef PYPY_VERSION -# if PY_VERSION_HEX < 0x03070000 && defined(PySlice_GetIndicesEx) -# undef PySlice_GetIndicesEx -# endif // PY_VERSION_HEX -#endif // PYPY_VERSION - -PANDAS_INLINE int slice_get_indices(PyObject *s, - Py_ssize_t length, - Py_ssize_t *start, - Py_ssize_t *stop, - Py_ssize_t *step, - Py_ssize_t *slicelength) { -#if PY_VERSION_HEX >= 0x03000000 - return PySlice_GetIndicesEx(s, length, start, stop, - step, slicelength); -#else - return PySlice_GetIndicesEx((PySliceObject *)s, length, start, - stop, step, slicelength); -#endif // PY_VERSION_HEX -} - -#endif // PANDAS__LIBS_SRC_COMPAT_HELPER_H_ diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 0a767dd27b658..7fbe7a04d5b22 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -11,8 +11,6 @@ The full license is in the LICENSE file, distributed with this software. #define PANDAS__LIBS_SRC_PARSE_HELPER_H_ #include -#include "inline_helper.h" -#include "headers/portable.h" #include "parser/tokenizer.h" int to_double(char *item, double *p_value, char sci, char decimal, @@ -94,12 +92,4 @@ int floatify(PyObject *str, double *result, int *maybe_int) { return -1; } -PANDAS_INLINE void lowercase(char *p) { - for (; *p; ++p) *p = tolower_ascii(*p); -} - -PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper_ascii(*p); -} - #endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_ diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 5d73230f32955..1e3295fcb6fc7 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,7 +9,6 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" -#include #include #include @@ -17,7 +16,7 @@ The full license is in the LICENSE file, distributed with this software. #define O_BINARY 0 #endif // O_BINARY -#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32) +#ifdef _WIN32 #define USE_WIN_UTF16 #include #endif diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2752fb6424022..578f72112d02d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -25,19 +25,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include "../headers/portable.h" -static void *safe_realloc(void *buffer, size_t size) { - void *result; - // OSX is weird. 
- // http://stackoverflow.com/questions/9560609/ - // different-realloc-behaviour-in-linux-and-osx - - result = realloc(buffer, size); - TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, - result)) - - return result; -} - void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { // column i, starting at 0 self->words = parser->words; @@ -45,18 +32,6 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { self->line_start = parser->line_start + start; } -coliter_t *coliter_new(parser_t *self, int i) { - // column i, starting at 0 - coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t)); - - if (NULL == iter) { - return NULL; - } - - coliter_setup(iter, self, i, 0); - return iter; -} - static void free_if_not_null(void **ptr) { TRACE(("free_if_not_null %p\n", *ptr)) if (*ptr != NULL) { @@ -80,7 +55,7 @@ static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, while ((length + space >= cap) && (newbuffer != NULL)) { cap = cap ? cap << 1 : 2; buffer = newbuffer; - newbuffer = safe_realloc(newbuffer, elsize * cap); + newbuffer = realloc(newbuffer, elsize * cap); } if (newbuffer == NULL) { @@ -321,8 +296,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { ("make_stream_space: cap != self->words_cap, nbytes = %d, " "self->words_cap=%d\n", nbytes, self->words_cap)) - newptr = safe_realloc((void *)self->word_starts, - sizeof(int64_t) * self->words_cap); + newptr = realloc((void *)self->word_starts, + sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -349,8 +324,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { if (cap != self->lines_cap) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - newptr = safe_realloc((void *)self->line_fields, - sizeof(int64_t) * self->lines_cap); + newptr = realloc((void *)self->line_fields, + sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -427,7 +402,7 @@ static void append_warning(parser_t *self, const char *msg) { snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); - newptr = safe_realloc(self->warn_msg, ex_length + length + 1); + newptr = realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { self->warn_msg = (char *)newptr; snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); @@ -1290,13 +1265,13 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); + newptr = realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->words = (char **)newptr; } - newptr = safe_realloc((void *)self->word_starts, + newptr = realloc((void *)self->word_starts, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1315,13 +1290,13 @@ int parser_trim_buffers(parser_t *self) { if (new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " - "safe_realloc\n")); - newptr = safe_realloc((void *)self->stream, new_cap); + "realloc\n")); + newptr = realloc((void *)self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { // Update the pointers in the self->words array (char **) if - // `safe_realloc` + // `realloc` // moved the 
`self->stream` buffer. This block mirrors a similar // block in // `make_stream_space`. @@ -1342,14 +1317,14 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void *)self->line_start, + newptr = realloc((void *)self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = (int64_t *)newptr; } - newptr = safe_realloc((void *)self->line_fields, + newptr = realloc((void *)self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1426,42 +1401,30 @@ int tokenize_all_rows(parser_t *self) { return status; } -PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper_ascii(*p); -} - +/* + * Function: to_boolean + * -------------------- + * + * Validate if item should be recognized as a boolean field. + * + * item: const char* representing parsed text + * val : pointer to a uint8_t of boolean representation + * + * If item is determined to be boolean, this method will set + * the appropriate value of val and return 0. A non-zero exit + * status means that item was not inferred to be boolean, and + * leaves the value of *val unmodified. + */ int to_boolean(const char *item, uint8_t *val) { - char *tmp; - int i, status = 0; - size_t length0 = (strlen(item) + 1); - int bufsize = length0; - - static const char *tstrs[1] = {"TRUE"}; - static const char *fstrs[1] = {"FALSE"}; - - tmp = malloc(bufsize); - snprintf(tmp, length0, "%s", item); - uppercase(tmp); - - for (i = 0; i < 1; ++i) { - if (strcmp(tmp, tstrs[i]) == 0) { - *val = 1; - goto done; - } + if (strcasecmp(item, "TRUE") == 0) { + *val = 1; + return 0; + } else if (strcasecmp(item, "FALSE") == 0) { + *val = 0; + return 0; } - for (i = 0; i < 1; ++i) { - if (strcmp(tmp, fstrs[i]) == 0) { - *val = 0; - goto done; - } - } - - status = -1; - -done: - free(tmp); - return status; + return -1; } // --------------------------------------------------------------------------- diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 66ef1887d6bc3..b37de47662feb 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -15,13 +15,13 @@ See LICENSE for the license #define PY_SSIZE_T_CLEAN #include -#define ERROR_OK 0 #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 #include "../headers/stdint.h" #include "../inline_helper.h" +#include "../headers/portable.h" #include "khash.h" @@ -31,10 +31,6 @@ See LICENSE for the license #define CALLING_READ_FAILED 2 -#if defined(_MSC_VER) -#define strtoll _strtoi64 -#endif // _MSC_VER - /* C flat file parsing low level code for pandas / NumPy @@ -161,9 +157,9 @@ typedef struct parser_t { int64_t skip_footer; // pick one, depending on whether the converter requires GIL double (*double_converter_nogil)(const char *, char **, - char, char, char, int); + char, char, char, int, int *, int *); double (*double_converter_withgil)(const char *, char **, - char, char, char, int); + char, char, char, int, int *, int *); // error handling char *warn_msg; @@ -179,7 +175,6 @@ typedef struct coliter_t { } coliter_t; void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); -coliter_t *coliter_new(parser_t *self, int i); #define COLITER_NEXT(iter, word) \ do { \ diff --git a/pandas/_libs/src/ujson/python/objToJSON.c 
b/pandas/_libs/src/ujson/python/objToJSON.c index 48712dc68829d..21f439ec93e0f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -399,22 +399,6 @@ static void *CLong(JSOBJ obj, JSONTypeContext *tc, void *outValue, return NULL; } -#ifdef _LP64 -static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((JSINT64 *)outValue) = PyLong_AsLong(obj); - return NULL; -} -#else -static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((JSINT32 *)outValue) = PyLong_AsLong(obj); - return NULL; -} -#endif - static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { *((JSINT64 *)outValue) = GET_TC(tc)->longValue; diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index f848310d961e1..141735a97938a 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -204,12 +204,12 @@ cpdef assert_almost_equal(a, b, # case for zero if abs(fa) < 1e-5: if not decimal_almost_equal(fa, fb, decimal): - assert False, ('(very low values) expected %.5f but ' - 'got %.5f, with decimal %d' % (fb, fa, decimal)) + assert False, (f'(very low values) expected {fb:.5f} ' + f'but got {fa:.5f}, with decimal {decimal}') else: if not decimal_almost_equal(1, fb / fa, decimal): - assert False, ('expected %.5f but got %.5f, ' - 'with decimal %d' % (fb, fa, decimal)) + assert False, (f'expected {fb:.5f} but got {fa:.5f}, ' + f'with decimal {decimal}') return True raise AssertionError(f"{a} != {b}") diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d101a2976cd55..598def4e1d9fa 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -266,20 +266,16 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, elif basic_format: dt64_to_dtstruct(val, &dts) - res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, - dts.month, - dts.day, - dts.hour, - dts.min, - dts.sec) + res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' + f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') if show_ns: ns = dts.ps // 1000 - res += '.%.9d' % (ns + 1000 * dts.us) + res += f'.{ns + dts.us * 1000:09d}' elif show_us: - res += '.%.6d' % dts.us + res += f'.{dts.us:06d}' elif show_ms: - res += '.%.3d' % (dts.us /1000) + res += f'.{dts.us // 1000:03d}' result[i] = res diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 8e4143a053ba3..c6c98e996b745 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -55,11 +55,11 @@ def maybe_integer_op_deprecated(obj): # GH#22535 add/sub of integers and int-arrays is deprecated if obj.freq is not None: warnings.warn("Addition/subtraction of integers and integer-arrays " - "to {cls} is deprecated, will be removed in a future " + f"to {type(obj).__name__} is deprecated, " + "will be removed in a future " "version. Instead of adding/subtracting `n`, use " "`n * self.freq`" - .format(cls=type(obj).__name__), - FutureWarning) + , FutureWarning) cdef class _Timestamp(datetime): @@ -144,11 +144,10 @@ cdef class _Timestamp(datetime): # e.g. 
tzlocal has no `strftime` pass - tz = ", tz='{0}'".format(zone) if zone is not None else "" - freq = "" if self.freq is None else ", freq='{0}'".format(self.freqstr) + tz = f", tz='{zone}'" if zone is not None else "" + freq = "" if self.freq is None else f", freq='{self.freqstr}'" - return "Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp, - tz=tz, freq=freq) + return f"Timestamp('{stamp}'{tz}{freq})" cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -201,7 +200,7 @@ cdef class _Timestamp(datetime): """ return np.datetime64(self.value, 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ Convert the Timestamp to a NumPy datetime64. @@ -369,29 +368,28 @@ cdef class _Timestamp(datetime): return out[0] @property - def _repr_base(self): - return '{date} {time}'.format(date=self._date_repr, - time=self._time_repr) + def _repr_base(self) -> str: + return f"{self._date_repr} {self._time_repr}" @property - def _date_repr(self): + def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but # the datetime strftime() methods require year >= 1900 - return '%d-%.2d-%.2d' % (self.year, self.month, self.day) + return f'{self.year}-{self.month:02d}-{self.day:02d}' @property - def _time_repr(self): - result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second) + def _time_repr(self) -> str: + result = f'{self.hour:02d}:{self.minute:02d}:{self.second:02d}' if self.nanosecond != 0: - result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond) + result += f'.{self.nanosecond + 1000 * self.microsecond:09d}' elif self.microsecond != 0: - result += '.%.6d' % self.microsecond + result += f'.{self.microsecond:06d}' return result @property - def _short_repr(self): + def _short_repr(self) -> str: # format a Timestamp with only _date_repr if possible # otherwise _repr_base if (self.hour == 0 and @@ -403,7 +401,7 @@ cdef class _Timestamp(datetime): return self._repr_base @property - def asm8(self): + def asm8(self) -> np.datetime64: """ Return numpy datetime64 format in nanoseconds. """ diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index bd74180403ad9..c5315219b8422 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -197,7 +197,7 @@ def datetime_to_datetime64(object[:] values): iresult[i] = pydatetime_to_dt64(val, &dts) check_dts_bounds(&dts) else: - raise TypeError('Unrecognized value type: %s' % type(val)) + raise TypeError(f'Unrecognized value type: {type(val)}') return result, inferred_tz @@ -326,8 +326,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, raise ValueError("Cannot convert Period to Timestamp " "unambiguously. 
Use to_timestamp") else: - raise TypeError('Cannot convert input [{}] of type {} to ' - 'Timestamp'.format(ts, type(ts))) + raise TypeError(f'Cannot convert input [{ts}] of type {type(ts)} to ' + f'Timestamp') if tz is not None: localize_tso(obj, tz) @@ -686,7 +686,7 @@ def normalize_date(dt: object) -> datetime: elif PyDate_Check(dt): return datetime(dt.year, dt.month, dt.day) else: - raise TypeError('Unrecognized type: %s' % type(dt)) + raise TypeError(f'Unrecognized type: {type(dt)}') @cython.wraparound(False) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 8f5c8d10776df..dfed8d06530aa 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -130,7 +130,7 @@ def get_date_name_field(const int64_t[:] dtindex, object field, object locale=No out[i] = names[dts.month].capitalize() else: - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") return out @@ -165,8 +165,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, if freqstr: if freqstr == 'C': - raise ValueError("Custom business days is not supported by {field}" - .format(field=field)) + raise ValueError(f"Custom business days is not supported by {field}") is_business = freqstr[0] == 'B' # YearBegin(), BYearBegin() use month = starting month of year. @@ -373,7 +372,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, out[i] = 1 else: - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") return out.view(bool) @@ -537,7 +536,7 @@ def get_date_field(const int64_t[:] dtindex, object field): elif field == 'is_leap_year': return isleapyear_arr(get_date_field(dtindex, 'Y')) - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") @cython.wraparound(False) @@ -653,7 +652,7 @@ def get_timedelta_field(const int64_t[:] tdindex, object field): out[i] = tds.nanoseconds return out - raise ValueError("Field %s not supported" % field) + raise ValueError(f"Field {field} not supported") cpdef isleapyear_arr(ndarray years): diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index b29c841896072..660f4ddcec736 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -197,7 +197,7 @@ cpdef _base_and_stride(str freqstr): groups = opattern.match(freqstr) if not groups: - raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) + raise ValueError(f"Could not evaluate {freqstr}") stride = groups.group(1) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index e491d6111a919..6fab827f1364a 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -95,10 +95,6 @@ cdef class _NaT(datetime): # higher than np.ndarray and np.matrix __array_priority__ = 100 - def __hash__(_NaT self): - # py3k needs this defined here - return hash(self.value) - def __richcmp__(_NaT self, object other, int op): cdef: int ndim = getattr(other, 'ndim', -1) @@ -115,8 +111,8 @@ cdef class _NaT(datetime): if is_datetime64_object(other): return _nat_scalar_rules[op] else: - raise TypeError('Cannot compare type %r with type %r' % - (type(self).__name__, type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} ' + f'with type {type(other).__name__}') # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` 
is invariant under `_reverse_ops`, @@ -150,8 +146,7 @@ cdef class _NaT(datetime): result = np.empty(other.shape, dtype="datetime64[ns]") result.fill("NaT") return result - raise TypeError("Cannot add NaT to ndarray with dtype {dtype}" - .format(dtype=other.dtype)) + raise TypeError(f"Cannot add NaT to ndarray with dtype {other.dtype}") return NotImplemented @@ -203,9 +198,8 @@ cdef class _NaT(datetime): result.fill("NaT") return result - raise TypeError( - "Cannot subtract NaT from ndarray with dtype {dtype}" - .format(dtype=other.dtype)) + raise TypeError(f"Cannot subtract NaT from ndarray with " + f"dtype {other.dtype}") return NotImplemented @@ -230,16 +224,16 @@ cdef class _NaT(datetime): return NotImplemented @property - def asm8(self): + def asm8(self) -> np.datetime64: return np.datetime64(NPY_NAT, 'ns') - def to_datetime64(self): + def to_datetime64(self) -> np.datetime64: """ Return a numpy.datetime64 object with 'ns' precision. """ return np.datetime64('NaT', 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ Convert the Timestamp to a NumPy datetime64. @@ -265,7 +259,7 @@ cdef class _NaT(datetime): def __str__(self) -> str: return 'NaT' - def isoformat(self, sep='T'): + def isoformat(self, sep='T') -> str: # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. return 'NaT' diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index e76f84265a327..b9406074bb130 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -112,11 +112,9 @@ cdef inline check_dts_bounds(npy_datetimestruct *dts): error = True if error: - fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, - dts.day, dts.hour, - dts.min, dts.sec) - raise OutOfBoundsDatetime( - 'Out of bounds nanosecond timestamp: {fmt}'.format(fmt=fmt)) + fmt = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' + f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') + raise OutOfBoundsDatetime(f'Out of bounds nanosecond timestamp: {fmt}') # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 434252677f1a1..68a0a4a403c81 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -66,16 +66,16 @@ need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS'] for __prefix in need_suffix: for _m in MONTHS: - key = '%s-%s' % (__prefix, _m) + key = f'{__prefix}-{_m}' _offset_to_period_map[key] = _offset_to_period_map[__prefix] for __prefix in ['A', 'Q']: for _m in MONTHS: - _alias = '%s-%s' % (__prefix, _m) + _alias = f'{__prefix}-{_m}' _offset_to_period_map[_alias] = _alias for _d in DAYS: - _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d + _offset_to_period_map[f'W-{_d}'] = f'W-{_d}' # --------------------------------------------------------------------- @@ -432,9 +432,9 @@ class _BaseOffset: n_str = "" if self.n != 1: - n_str = "%s * " % self.n + n_str = f"{self.n} * " - out = '<%s' % n_str + className + plural + self._repr_attrs() + '>' + out = f'<{n_str}{className}{plural}{self._repr_attrs()}>' return out def _get_offset_day(self, datetime other): @@ -460,16 +460,13 @@ class _BaseOffset: ValueError if n != int(n) """ if util.is_timedelta64_object(n): - raise TypeError('`n` argument must be an integer, ' - 'got {ntype}'.format(ntype=type(n))) + raise TypeError(f'`n` argument must be an integer, got {type(n)}') try: nint = int(n) except (ValueError, 
TypeError): - raise TypeError('`n` argument must be an integer, ' - 'got {ntype}'.format(ntype=type(n))) + raise TypeError(f'`n` argument must be an integer, got {type(n)}') if n != nint: - raise ValueError('`n` argument must be an integer, ' - 'got {n}'.format(n=n)) + raise ValueError(f'`n` argument must be an integer, got {n}') return nint def __setstate__(self, state): diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 8fe724fa2f6f7..ecf3e35c86d76 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -153,7 +153,7 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst): return datetime_new(year, month, day, 0, 0, 0, 0, None), reso return datetime(year, month, day, 0, 0, 0, 0, None), reso - raise DateParseError("Invalid date specified ({}/{})".format(month, day)) + raise DateParseError(f"Invalid date specified ({month}/{day})") cdef inline bint does_string_look_like_time(object parse_string): @@ -311,7 +311,7 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, # TODO: allow raise of errors within instead raise DateParseError(err) if parsed is None: - raise DateParseError("Could not parse {dstr}".format(dstr=date_string)) + raise DateParseError(f"Could not parse {date_string}") return parsed, parsed, reso @@ -420,18 +420,18 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {dstr}') - raise DateParseError(msg.format(dstr=date_string)) + raise DateParseError(f'Incorrect quarterly string is given, ' + f'quarter must be ' + f'between 1 and 4: {date_string}') if freq is not None: # hack attack, #1228 try: mnum = MONTH_NUMBERS[_get_rule_month(freq)] + 1 except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {freq}'.format(freq=freq)) - raise DateParseError(msg) + raise DateParseError(f'Unable to retrieve month ' + f'information from given ' + f'freq: {freq}') month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: @@ -464,7 +464,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, except ValueError: pass - raise ValueError('Unable to parse {0}'.format(date_string)) + raise ValueError(f'Unable to parse {date_string}') cdef dateutil_parse(object timestr, object default, ignoretz=False, @@ -484,8 +484,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, res, _ = res if res is None: - msg = "Unknown datetime string format, unable to parse: {timestr}" - raise ValueError(msg.format(timestr=timestr)) + raise ValueError(f"Unknown datetime string format, unable to parse: {timestr}") for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: @@ -495,8 +494,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, reso = attr if reso is None: - msg = "Unable to parse datetime string: {timestr}" - raise ValueError(msg.format(timestr=timestr)) + raise ValueError(f"Unable to parse datetime string: {timestr}") if reso == 'microsecond': if repl['microsecond'] == 0: @@ -710,7 +708,7 @@ class _timelex: elif getattr(instream, 'read', None) is None: raise TypeError( 'Parser must be a string or character stream, not ' - '{itype}'.format(itype=instream.__class__.__name__)) + f'{type(instream).__name__}') else: self.stream = instream.read() diff --git a/pandas/_libs/tslibs/period.pyx 
b/pandas/_libs/tslibs/period.pyx index 2512fdb891e3e..80db081a4fc52 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1227,7 +1227,7 @@ def period_format(int64_t value, int freq, object fmt=None): elif freq_group == 12000: # NANOSEC fmt = b'%Y-%m-%d %H:%M:%S.%n' else: - raise ValueError('Unknown freq: {freq}'.format(freq=freq)) + raise ValueError(f'Unknown freq: {freq}') return _period_strftime(value, freq, fmt) @@ -1273,17 +1273,17 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): raise ValueError('Unable to get quarter and year') if i == 0: - repl = '%d' % quarter + repl = str(quarter) elif i == 1: # %f, 2-digit year - repl = '%.2d' % (year % 100) + repl = f'{(year % 100):02d}' elif i == 2: - repl = '%d' % year + repl = str(year) elif i == 3: - repl = '%03d' % (value % 1000) + repl = f'{(value % 1_000):03d}' elif i == 4: - repl = '%06d' % (value % 1000000) + repl = f'{(value % 1_000_000):06d}' elif i == 5: - repl = '%09d' % (value % 1000000000) + repl = f'{(value % 1_000_000_000):09d}' result = result.replace(str_extra_fmts[i], repl) @@ -1391,7 +1391,7 @@ def get_period_field_arr(int code, int64_t[:] arr, int freq): func = _get_accessor_func(code) if func is NULL: - raise ValueError('Unrecognized period code: {code}'.format(code=code)) + raise ValueError(f'Unrecognized period code: {code}') sz = len(arr) out = np.empty(sz, dtype=np.int64) @@ -1578,9 +1578,8 @@ cdef class _Period: freq = to_offset(freq) if freq.n <= 0: - raise ValueError('Frequency must be positive, because it' - ' represents span: {freqstr}' - .format(freqstr=freq.freqstr)) + raise ValueError(f'Frequency must be positive, because it ' + f'represents span: {freq.freqstr}') return freq @@ -1614,9 +1613,8 @@ cdef class _Period: return NotImplemented elif op == Py_NE: return NotImplemented - raise TypeError('Cannot compare type {cls} with type {typ}' - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} ' + f'with type {type(other).__name__}') def __hash__(self): return hash((self.ordinal, self.freqstr)) @@ -1634,8 +1632,8 @@ cdef class _Period: if nanos % offset_nanos == 0: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) - msg = 'Input cannot be converted to Period(freq={0})' - raise IncompatibleFrequency(msg.format(self.freqstr)) + raise IncompatibleFrequency(f'Input cannot be converted to ' + f'Period(freq={self.freqstr})') elif util.is_offset_object(other): freqstr = other.rule_code base = get_base_alias(freqstr) @@ -1665,9 +1663,8 @@ cdef class _Period: # GH#17983 sname = type(self).__name__ oname = type(other).__name__ - raise TypeError("unsupported operand type(s) for +: '{self}' " - "and '{other}'".format(self=sname, - other=oname)) + raise TypeError(f"unsupported operand type(s) for +: '{sname}' " + f"and '{oname}'") else: # pragma: no cover return NotImplemented elif is_period_object(other): @@ -2218,7 +2215,7 @@ cdef class _Period: def __repr__(self) -> str: base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) - return "Period('%s', '%s')" % (formatted, self.freqstr) + return f"Period('{formatted}', '{self.freqstr}')" def __str__(self) -> str: """ @@ -2226,7 +2223,7 @@ cdef class _Period: """ base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) - value = ("%s" % formatted) + value = str(formatted) return value def __setstate__(self, state): @@ -2477,9 +2474,8 @@ class 
Period(_Period): try: freq = Resolution.get_freq(reso) except KeyError: - raise ValueError( - "Invalid frequency or could not infer: {reso}" - .format(reso=reso)) + raise ValueError(f"Invalid frequency or could not " + f"infer: {reso}") elif PyDateTime_Check(value): dt = value diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fbda5f178e164..fda508e51e48f 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -106,11 +106,11 @@ def array_strptime(object[:] values, object fmt, if bad_directive == "\\": bad_directive = "%" del err - raise ValueError("'%s' is a bad directive in format '%s'" % - (bad_directive, fmt)) + raise ValueError(f"'{bad_directive}' is a bad directive " + f"in format '{fmt}'") # IndexError only occurs when the format string is "%" except IndexError: - raise ValueError("stray %% in format '%s'" % fmt) + raise ValueError(f"stray % in format '{fmt}'") _regex_cache[fmt] = format_regex result = np.empty(n, dtype='M8[ns]') @@ -139,14 +139,13 @@ def array_strptime(object[:] values, object fmt, if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match " - "format %r (match)" % (val, fmt)) + raise ValueError(f"time data '{val}' does not match " + f"format '{fmt}' (match)") if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("unconverted data remains: %s" % - val[found.end():]) + raise ValueError(f"unconverted data remains: {val[found.end():]}") # search else: @@ -155,8 +154,8 @@ def array_strptime(object[:] values, object fmt, if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format " - "%r (search)" % (val, fmt)) + raise ValueError(f"time data {repr(val)} does not match format " + f"{repr(fmt)} (search)") iso_year = -1 year = 1900 @@ -589,8 +588,8 @@ class TimeRE(dict): else: return '' regex = '|'.join(re.escape(stuff) for stuff in to_convert) - regex = '(?P<%s>%s' % (directive, regex) - return '%s)' % regex + regex = f'(?P<{directive}>{regex})' + return regex def pattern(self, format): """ @@ -609,11 +608,11 @@ class TimeRE(dict): format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: directive_index = format.index('%') +1 - processed_format = "%s%s%s" % (processed_format, - format[:directive_index -1], - self[format[directive_index]]) + processed_format = (f"{processed_format}" + f"{format[:directive_index -1]}" + f"{self[format[directive_index]]}") format = format[directive_index +1:] - return "%s%s" % (processed_format, format) + return f"{processed_format}{format}" def compile(self, format): """Return a compiled re object for the format string.""" @@ -737,8 +736,7 @@ cdef parse_timezone_directive(str z): z = z[:3] + z[4:] if len(z) > 5: if z[5] != ':': - msg = "Inconsistent use of : in {0}" - raise ValueError(msg.format(z)) + raise ValueError(f"Inconsistent use of : in {z}") z = z[:5] + z[6:] hours = int(z[1:3]) minutes = int(z[3:5]) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 9d8ed62388655..8e5b719749857 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -170,7 +170,7 @@ cdef convert_to_timedelta64(object ts, object unit): if ts.astype('int64') == NPY_NAT: return np.timedelta64(NPY_NAT) elif is_timedelta64_object(ts): - ts = ts.astype("m8[{unit}]".format(unit=unit.lower())) + ts = ts.astype(f"m8[{unit.lower()}]") elif is_integer_object(ts): if ts == NPY_NAT: return 
np.timedelta64(NPY_NAT) @@ -198,8 +198,7 @@ cdef convert_to_timedelta64(object ts, object unit): if PyDelta_Check(ts): ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns') elif not is_timedelta64_object(ts): - raise ValueError("Invalid type for timedelta " - "scalar: {ts_type}".format(ts_type=type(ts))) + raise ValueError(f"Invalid type for timedelta scalar: {type(ts)}") return ts.astype('timedelta64[ns]') @@ -288,7 +287,7 @@ cpdef inline object precision_from_unit(object unit): m = 1L p = 0 else: - raise ValueError("cannot cast unit {unit}".format(unit=unit)) + raise ValueError(f"cannot cast unit {unit}") return m, p @@ -397,8 +396,7 @@ cdef inline int64_t parse_timedelta_string(str ts) except? -1: result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: - raise ValueError("expecting hh:mm:ss format, " - "received: {ts}".format(ts=ts)) + raise ValueError(f"expecting hh:mm:ss format, received: {ts}") unit, number = [], [] @@ -511,7 +509,7 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): unit = 'm' unit = parse_timedelta_unit(unit) except KeyError: - raise ValueError("invalid abbreviation: {unit}".format(unit=unit)) + raise ValueError(f"invalid abbreviation: {unit}") n = ''.join(number) + '.' + ''.join(frac) return cast_from_unit(float(n), unit) @@ -530,8 +528,7 @@ cpdef inline object parse_timedelta_unit(object unit): try: return timedelta_abbrevs[unit.lower()] except (KeyError, AttributeError): - raise ValueError("invalid unit abbreviation: {unit}" - .format(unit=unit)) + raise ValueError(f"invalid unit abbreviation: {unit}") # ---------------------------------------------------------------------- # Timedelta ops utilities @@ -727,8 +724,7 @@ cdef _to_py_int_float(v): return int(v) elif is_float_object(v): return float(v) - raise TypeError("Invalid type {typ}. Must be int or " - "float.".format(typ=type(v))) + raise TypeError(f"Invalid type {type(v)}. Must be int or float.") # Similar to Timestamp/datetime, this is a construction requirement for @@ -773,10 +769,9 @@ cdef class _Timedelta(timedelta): elif op == Py_NE: return True # only allow ==, != ops - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + raise TypeError(f'Cannot compare type ' + f'{type(self).__name__} with ' + f'type {type(other).__name__}') if util.is_array(other): return PyObject_RichCompare(np.array([self]), other, op) return PyObject_RichCompare(other, self, reverse_ops[op]) @@ -787,10 +782,8 @@ cdef class _Timedelta(timedelta): return False elif op == Py_NE: return True - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} with ' + f'type {type(other).__name__}') return cmp_scalar(self.value, ots.value, op) @@ -841,15 +834,15 @@ cdef class _Timedelta(timedelta): """ return timedelta(microseconds=int(self.value) / 1000) - def to_timedelta64(self): + def to_timedelta64(self) -> np.timedelta64: """ Return a numpy.timedelta64 object with 'ns' precision. """ return np.timedelta64(self.value, 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.timedelta64: """ - Convert the Timestamp to a NumPy timedelta64. + Convert the Timedelta to a NumPy timedelta64. .. 
versionadded:: 0.25.0 @@ -920,7 +913,7 @@ cdef class _Timedelta(timedelta): return self.value @property - def asm8(self): + def asm8(self) -> np.timedelta64: """ Return a numpy timedelta64 array scalar view. @@ -955,7 +948,7 @@ cdef class _Timedelta(timedelta): return np.int64(self.value).view('m8[ns]') @property - def resolution_string(self): + def resolution_string(self) -> str: """ Return a string representing the lowest timedelta resolution. @@ -1095,7 +1088,7 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._ns - def _repr_base(self, format=None): + def _repr_base(self, format=None) -> str: """ Parameters @@ -1143,15 +1136,16 @@ cdef class _Timedelta(timedelta): return fmt.format(**comp_dict) def __repr__(self) -> str: - return "Timedelta('{val}')".format(val=self._repr_base(format='long')) + repr_based = self._repr_base(format='long') + return f"Timedelta('{repr_based}')" def __str__(self) -> str: return self._repr_base(format='long') - def __bool__(self): + def __bool__(self) -> bool: return self.value != 0 - def isoformat(self): + def isoformat(self) -> str: """ Format Timedelta as ISO 8601 Duration like ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the @@ -1189,14 +1183,14 @@ cdef class _Timedelta(timedelta): 'P500DT12H0MS' """ components = self.components - seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds, - components.milliseconds, - components.microseconds, - components.nanoseconds) + seconds = (f'{components.seconds}.' + f'{components.milliseconds:0>3}' + f'{components.microseconds:0>3}' + f'{components.nanoseconds:0>3}') # Trim unnecessary 0s, 1.000000000 -> 1 seconds = seconds.rstrip('0').rstrip('.') - tpl = ('P{td.days}DT{td.hours}H{td.minutes}M{seconds}S' - .format(td=components, seconds=seconds)) + tpl = (f'P{components.days}DT{components.hours}' + f'H{components.minutes}M{seconds}S') return tpl @@ -1276,7 +1270,7 @@ class Timedelta(_Timedelta): value = convert_to_timedelta64(value, 'ns') elif is_timedelta64_object(value): if unit is not None: - value = value.astype('timedelta64[{0}]'.format(unit)) + value = value.astype(f'timedelta64[{unit}]') value = value.astype('timedelta64[ns]') elif hasattr(value, 'delta'): value = np.timedelta64(delta_to_nanoseconds(value.delta), 'ns') @@ -1288,9 +1282,8 @@ class Timedelta(_Timedelta): return NaT else: raise ValueError( - "Value must be Timedelta, string, integer, " - "float, timedelta or convertible, not {typ}" - .format(typ=type(value).__name__)) + f"Value must be Timedelta, string, integer, " + f"float, timedelta or convertible, not {type(value).__name__}") if is_timedelta64_object(value): value = value.view('i8') @@ -1485,9 +1478,7 @@ class Timedelta(_Timedelta): else: return self.to_timedelta64() // other - raise TypeError('Invalid dtype {dtype} for ' - '{op}'.format(dtype=other.dtype, - op='__floordiv__')) + raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__') elif is_integer_object(other) or is_float_object(other): return Timedelta(self.value // other, unit='ns') @@ -1530,9 +1521,7 @@ class Timedelta(_Timedelta): """) warnings.warn(msg, FutureWarning) return other // self.value - raise TypeError('Invalid dtype {dtype} for ' - '{op}'.format(dtype=other.dtype, - op='__floordiv__')) + raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__') elif is_float_object(other) and util.is_nan(other): # i.e. 
np.nan @@ -1555,8 +1544,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype') and other.dtype.kind == 'i': # TODO: Remove this check with backwards-compat shim # for integer / Timedelta is removed. - raise TypeError("Invalid type {dtype} for " - "{op}".format(dtype=other.dtype, op='__mod__')) + raise TypeError(f'Invalid dtype {other.dtype} for __mod__') return self.__rdivmod__(other)[1] def __divmod__(self, other): @@ -1569,8 +1557,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype') and other.dtype.kind == 'i': # TODO: Remove this check with backwards-compat shim # for integer / Timedelta is removed. - raise TypeError("Invalid type {dtype} for " - "{op}".format(dtype=other.dtype, op='__mod__')) + raise TypeError(f'Invalid dtype {other.dtype} for __mod__') div = other // self return div, other - div * self diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 317dc769636fb..1a278f46a4a2b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -36,7 +36,6 @@ from pandas._libs.tslibs.tzconversion import ( # Constants _zero_time = datetime_time(0, 0) _no_input = object() -PY36 = sys.version_info >= (3, 6) # ---------------------------------------------------------------------- @@ -371,8 +370,8 @@ class Timestamp(_Timestamp): if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 - raise TypeError('tzinfo must be a datetime.tzinfo object, ' - 'not %s' % type(tzinfo)) + raise TypeError(f'tzinfo must be a datetime.tzinfo object, ' + f'not {type(tzinfo)}') elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') @@ -947,8 +946,8 @@ default 'raise' def validate(k, v): """ validate integers """ if not is_integer_object(v): - raise ValueError("value must be an integer, received " - "{v} for {k}".format(v=type(v), k=k)) + raise ValueError(f"value must be an integer, received " + f"{type(v)} for {k}") return v if year is not None: @@ -982,9 +981,8 @@ default 'raise' else: kwargs = {'year': dts.year, 'month': dts.month, 'day': dts.day, 'hour': dts.hour, 'minute': dts.min, 'second': dts.sec, - 'microsecond': dts.us, 'tzinfo': _tzinfo} - if PY36: - kwargs['fold'] = fold + 'microsecond': dts.us, 'tzinfo': _tzinfo, + 'fold': fold} ts_input = datetime(**kwargs) ts = convert_datetime_to_tsobject(ts_input, _tzinfo) @@ -1005,9 +1003,9 @@ default 'raise' base1, base2 = base, "" if self.microsecond != 0: - base1 += "%.3d" % self.nanosecond + base1 += f"{self.nanosecond:03d}" else: - base1 += ".%.9d" % self.nanosecond + base1 += f".{self.nanosecond:09d}" return base1 + base2 diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index bc1fdfae99de9..35ee87e714fa8 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -280,8 +280,8 @@ def infer_tzinfo(start, end): if start is not None and end is not None: tz = start.tzinfo if not tz_compare(tz, end.tzinfo): - msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' - raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + raise AssertionError(f'Inputs must both have the same timezone, ' + f'{tz} != {end.tzinfo}') elif start is not None: tz = start.tzinfo elif end is not None: diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index dd0c6fc75b06f..b368f0fde3edc 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -175,8 +175,8 @@ timedelta-like} if 
trans_idx.size == 1: stamp = _render_tstamp(vals[trans_idx]) raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %s as there " - "are no repeated times".format(stamp)) + f"Cannot infer dst time from {stamp} as there " + f"are no repeated times") # Split the array into contiguous chunks (where the difference between # indices is 1). These are effectively dst transitions in different # years which is useful for checking that there is not an ambiguous @@ -200,8 +200,8 @@ timedelta-like} switch_idx = (delta <= 0).nonzero()[0] if switch_idx.size > 1: raise pytz.AmbiguousTimeError( - "There are %i dst switches when " - "there should only be 1.".format(switch_idx.size)) + f"There are {switch_idx.size} dst switches when " + f"there should only be 1.") switch_idx = switch_idx[0] + 1 # Pull the only index and adjust a_idx = grp[:switch_idx] @@ -230,8 +230,8 @@ timedelta-like} else: stamp = _render_tstamp(val) raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %r, try using the " - "'ambiguous' argument".format(stamp)) + f"Cannot infer dst time from {stamp}, try using the " + f"'ambiguous' argument") elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: @@ -246,8 +246,8 @@ timedelta-like} # time if -1 < shift_delta + remaining_mins < HOURS_NS: raise ValueError( - "The provided timedelta will relocalize on a " - "nonexistent time: {}".format(nonexistent) + f"The provided timedelta will relocalize on a " + f"nonexistent time: {nonexistent}" ) new_local = val + shift_delta elif shift_forward: diff --git a/pandas/_libs/window/__init__.py b/pandas/_libs/window/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window/aggregations.pyx similarity index 66% rename from pandas/_libs/window.pyx rename to pandas/_libs/window/aggregations.pyx index 86b06397123b7..303b4f6f24eac 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -96,280 +96,20 @@ def _check_minp(win, minp, N, floor=None) -> int: # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -# ---------------------------------------------------------------------- -# The indexer objects for rolling -# These define start/end indexers to compute offsets - - -cdef class WindowIndexer: - - cdef: - ndarray start, end - int64_t N, minp, win - bint is_variable - - def get_data(self): - return (self.start, self.end, self.N, - self.win, self.minp, - self.is_variable) - - -cdef class MockFixedWindowIndexer(WindowIndexer): - """ - - We are just checking parameters of the indexer, - and returning a consistent API with fixed/variable - indexers. 
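For context on the window indexer classes being removed here and on the new roll_* signatures that take precomputed offsets: each window i is simply the slice values[start[i]:end[i]]. A small NumPy sketch of that idea, with made-up bounds logic and data rather than the exact pandas semantics:

import numpy as np

def fixed_window_bounds(n, win):
    # right-closed, fixed-width windows ending at each position
    end = np.arange(1, n + 1, dtype=np.int64)
    start = np.maximum(end - win, 0)
    return start, end

def rolling_sum(values, start, end, minp):
    out = np.full(len(values), np.nan)
    for i in range(len(values)):
        window = values[start[i]:end[i]]
        if len(window) >= minp:
            out[i] = window.sum()
    return out

values = np.array([1.0, 2.0, 3.0, 4.0])
s, e = fixed_window_bounds(len(values), win=2)
rolling_sum(values, s, e, minp=2)   # -> [nan, 3.0, 5.0, 7.0]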
- - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: object - index of the values - floor: optional - unit for flooring - left_closed: bint - left endpoint closedness - right_closed: bint - right endpoint closedness - - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, - object index=None, object floor=None): - - assert index is None - self.is_variable = 0 - self.N = len(values) - self.minp = _check_minp(win, minp, self.N, floor=floor) - self.start = np.empty(0, dtype='int64') - self.end = np.empty(0, dtype='int64') - self.win = win - - -cdef class FixedWindowIndexer(WindowIndexer): - """ - create a fixed length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: object - index of the values - floor: optional - unit for flooring the unit - left_closed: bint - left endpoint closedness - right_closed: bint - right endpoint closedness - - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, - object index=None, object floor=None): - cdef: - ndarray[int64_t] start_s, start_e, end_s, end_e - - assert index is None - self.is_variable = 0 - self.N = len(values) - self.minp = _check_minp(win, minp, self.N, floor=floor) - - start_s = np.zeros(win, dtype='int64') - start_e = np.arange(win, self.N, dtype='int64') - win + 1 - self.start = np.concatenate([start_s, start_e]) - - end_s = np.arange(win, dtype='int64') + 1 - end_e = start_e + win - self.end = np.concatenate([end_s, end_e]) - self.win = win - - -cdef class VariableWindowIndexer(WindowIndexer): - """ - create a variable length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: ndarray - index of the values - left_closed: bint - left endpoint closedness - True if the left endpoint is closed, False if open - right_closed: bint - right endpoint closedness - True if the right endpoint is closed, False if open - floor: optional - unit for flooring the unit - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, ndarray index, - object floor=None): - - self.is_variable = 1 - self.N = len(index) - self.minp = _check_minp(win, minp, self.N, floor=floor) - - self.start = np.empty(self.N, dtype='int64') - self.start.fill(-1) - - self.end = np.empty(self.N, dtype='int64') - self.end.fill(-1) - - self.build(index, win, left_closed, right_closed) - - # max window size - self.win = (self.end - self.start).max() - - def build(self, const int64_t[:] index, int64_t win, bint left_closed, - bint right_closed): - - cdef: - ndarray[int64_t] start, end - int64_t start_bound, end_bound, N - Py_ssize_t i, j - - start = self.start - end = self.end - N = self.N - - start[0] = 0 - - # right endpoint is closed - if right_closed: - end[0] = 1 - # right endpoint is open - else: - end[0] = 0 - - with nogil: - - # start is start of slice interval (including) - # end is end of slice 
interval (not including) - for i in range(1, N): - end_bound = index[i] - start_bound = index[i] - win - - # left endpoint is closed - if left_closed: - start_bound -= 1 - - # advance the start bound until we are - # within the constraint - start[i] = i - for j in range(start[i - 1], i): - if index[j] > start_bound: - start[i] = j - break - - # end bound is previous end - # or current index - if index[end[i - 1]] <= end_bound: - end[i] = i + 1 - else: - end[i] = end[i - 1] - - # right endpoint is open - if not right_closed: - end[i] -= 1 - - -def get_window_indexer(values, win, minp, index, closed, - floor=None, use_mock=True): - """ - Return the correct window indexer for the computation. - - Parameters - ---------- - values: 1d ndarray - win: integer, window size - minp: integer, minimum periods - index: 1d ndarray, optional - index to the values array - closed: string, default None - {'right', 'left', 'both', 'neither'} - window endpoint closedness. Defaults to 'right' in - VariableWindowIndexer and to 'both' in FixedWindowIndexer - floor: optional - unit for flooring the unit - use_mock: boolean, default True - if we are a fixed indexer, return a mock indexer - instead of the FixedWindow Indexer. This is a type - compat Indexer that allows us to use a standard - code path with all of the indexers. - - Returns - ------- - tuple of 1d int64 ndarrays of the offsets & data about the window - - """ - - cdef: - bint left_closed = False - bint right_closed = False - - assert closed is None or closed in ['right', 'left', 'both', 'neither'] - - # if windows is variable, default is 'right', otherwise default is 'both' - if closed is None: - closed = 'right' if index is not None else 'both' - - if closed in ['right', 'both']: - right_closed = True - - if closed in ['left', 'both']: - left_closed = True - - if index is not None: - indexer = VariableWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - elif use_mock: - indexer = MockFixedWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - else: - indexer = FixedWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - return indexer.get_data() - # ---------------------------------------------------------------------- # Rolling count # this is only an impl for index not None, IOW, freq aware -def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_count(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp): cdef: float64_t val, count_x = 0.0 - int64_t s, e, nobs, N + int64_t s, e, nobs, N = len(values) Py_ssize_t i, j - int64_t[:] start, end ndarray[float64_t] output - start, end, N, win, minp, _ = get_window_indexer(values, win, - minp, index, closed) output = np.empty(N, dtype=float) with nogil: @@ -442,80 +182,75 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogi sum_x[0] = sum_x[0] - val -def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev_x, sum_x = 0 - int64_t s, e, range_endpoint - int64_t nobs = 0, i, j, N - bint is_variable - int64_t[:] start, end + float64_t sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed, - floor=0) output = 
np.empty(N, dtype=float) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we have 2 paths - # but is faster + with nogil: - if is_variable: + for i in range(0, N): + s = start[i] + e = end[i] - # variable window - with nogil: + if i == 0: - for i in range(0, N): - s = start[i] - e = end[i] + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + add_sum(values[j], &nobs, &sum_x) - if i == 0: + else: - # setup - sum_x = 0.0 - nobs = 0 - for j in range(s, e): - add_sum(values[j], &nobs, &sum_x) + # calculate deletes + for j in range(start[i - 1], s): + remove_sum(values[j], &nobs, &sum_x) - else: + # calculate adds + for j in range(end[i - 1], e): + add_sum(values[j], &nobs, &sum_x) - # calculate deletes - for j in range(start[i - 1], s): - remove_sum(values[j], &nobs, &sum_x) + output[i] = calc_sum(minp, nobs, sum_x) - # calculate adds - for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x) + return output - output[i] = calc_sum(minp, nobs, sum_x) - else: +def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): + cdef: + float64_t val, prev_x, sum_x = 0 + int64_t range_endpoint + int64_t nobs = 0, i, N = len(values) + ndarray[float64_t] output - # fixed window + output = np.empty(N, dtype=float) - range_endpoint = int_max(minp, 1) - 1 + range_endpoint = int_max(minp, 1) - 1 - with nogil: + with nogil: - for i in range(0, range_endpoint): - add_sum(values[i], &nobs, &sum_x) - output[i] = NaN + for i in range(0, range_endpoint): + add_sum(values[i], &nobs, &sum_x) + output[i] = NaN - for i in range(range_endpoint, N): - val = values[i] - add_sum(val, &nobs, &sum_x) + for i in range(range_endpoint, N): + val = values[i] + add_sum(val, &nobs, &sum_x) - if i > win - 1: - prev_x = values[i - win] - remove_sum(prev_x, &nobs, &sum_x) + if i > win - 1: + prev_x = values[i - win] + remove_sum(prev_x, &nobs, &sum_x) - output[i] = calc_sum(minp, nobs, sum_x) + output[i] = calc_sum(minp, nobs, sum_x) return output - # ---------------------------------------------------------------------- # Rolling mean @@ -563,77 +298,75 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, neg_ct[0] = neg_ct[0] - 1 -def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, result, sum_x = 0 - int64_t s, e - bint is_variable - Py_ssize_t nobs = 0, i, j, neg_ct = 0, N - int64_t[:] start, end + float64_t val, prev_x, sum_x = 0 + Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we have 2 paths - # but is faster + with nogil: + for i in range(minp - 1): + val = values[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN + + for i in range(minp - 1, N): + val = values[i] + add_mean(val, &nobs, &sum_x, &neg_ct) - if is_variable: + if i > win - 1: + prev_x = values[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) - with nogil: + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) - for i in range(0, N): - s = start[i] - e = end[i] + return output - if i == 0: - # setup - sum_x = 0.0 - nobs = 0 - 
for j in range(s, e): - val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) +def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, sum_x = 0 + int64_t s, e + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) + ndarray[float64_t] output - else: + output = np.empty(N, dtype=float) - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + with nogil: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + for i in range(0, N): + s = start[i] + e = end[i] - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + if i == 0: - else: + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + val = values[j] + add_mean(val, &nobs, &sum_x, &neg_ct) - with nogil: - for i in range(minp - 1): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = NaN + else: - for i in range(minp - 1, N): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) - if i > win - 1: - prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + add_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) return output - # ---------------------------------------------------------------------- # Rolling variance @@ -696,8 +429,8 @@ cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, ssqdm_x[0] = 0 -def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed, int ddof=1): +def roll_var_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win, int ddof=1): """ Numerically stable implementation using Welford's method. 
""" @@ -705,98 +438,102 @@ def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, float64_t val, prev, delta, mean_x_old int64_t s, e - bint is_variable - Py_ssize_t i, j, N - int64_t[:] start, end + Py_ssize_t i, j, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) # Check for windows larger than array, addresses #7297 win = min(win, N) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we - # have 2 paths but is faster + with nogil: - if is_variable: + # Over the first window, observations can only be added, never + # removed + for i in range(win): + add_var(values[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - with nogil: + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - for i in range(0, N): + # After the first window, observations can both be added and + # removed + for i in range(win, N): + val = values[i] + prev = values[i - win] - s = start[i] - e = end[i] + if notnan(val): + if prev == prev: - # Over the first window, observations can only be added - # never removed - if i == 0: + # Adding one observation and removing another one + delta = val - prev + mean_x_old = mean_x - for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) + mean_x += delta / nobs + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) - # After the first window, observations can both be added - # and removed + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + return output - # calculate adds - for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) - # calculate deletes - for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) +def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int ddof=1): + """ + Numerically stable implementation using Welford's method. 
+ """ + cdef: + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t val, prev, delta, mean_x_old + int64_t s, e + Py_ssize_t i, j, N = len(values) + ndarray[float64_t] output - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output = np.empty(N, dtype=float) - else: + with nogil: - with nogil: + for i in range(0, N): - # Over the first window, observations can only be added, never - # removed - for i in range(win): - add_var(values[i], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + s = start[i] + e = end[i] - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # Over the first window, observations can only be added + # never removed + if i == 0: - # After the first window, observations can both be added and - # removed - for i in range(win, N): - val = values[i] - prev = values[i - win] + for j in range(s, e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x) - if notnan(val): - if prev == prev: + else: - # Adding one observation and removing another one - delta = val - prev - mean_x_old = mean_x + # After the first window, observations can both be added + # and removed - mean_x += delta / nobs - ssqdm_x += ((nobs - 1) * val - + (nobs + 1) * prev - - 2 * nobs * mean_x_old) * delta / nobs + # calculate adds + for j in range(end[i - 1], e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x) - else: - add_var(val, &nobs, &mean_x, &ssqdm_x) - elif prev == prev: - remove_var(prev, &nobs, &mean_x, &ssqdm_x) + # calculate deletes + for j in range(start[i - 1], s): + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) return output - # ---------------------------------------------------------------------- # Rolling skewness + cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, float64_t x, float64_t xx, float64_t xxx) nogil: @@ -861,76 +598,80 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, xxx[0] = xxx[0] - val * val * val -def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_skew_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N + int64_t nobs = 0, i, j, N = len(values) int64_t s, e - bint is_variable - int64_t[:] start, end ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - if is_variable: + with nogil: + for i in range(minp - 1): + val = values[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN - with nogil: + for i in range(minp - 1, N): + val = values[i] + add_skew(val, &nobs, &x, &xx, &xxx) - for i in range(0, N): + if i > win - 1: + prev = values[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) - s = start[i] - e = end[i] + output[i] = calc_skew(minp, nobs, x, xx, xxx) - # Over the first window, observations can only be added - # never removed - if i == 0: + return output - for j in range(s, e): - val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) - else: +def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N = len(values) + int64_t s, e + ndarray[float64_t] output + + output = np.empty(N, 
dtype=float) - # After the first window, observations can both be added - # and removed + with nogil: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) + for i in range(0, N): - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + s = start[i] + e = end[i] - output[i] = calc_skew(minp, nobs, x, xx, xxx) + # Over the first window, observations can only be added + # never removed + if i == 0: - else: + for j in range(s, e): + val = values[j] + add_skew(val, &nobs, &x, &xx, &xxx) - with nogil: - for i in range(minp - 1): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) - output[i] = NaN + else: - for i in range(minp - 1, N): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) + # After the first window, observations can both be added + # and removed - if i > win - 1: - prev = values[i - win] - remove_skew(prev, &nobs, &x, &xx, &xxx) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + add_skew(val, &nobs, &x, &xx, &xxx) - output[i] = calc_skew(minp, nobs, x, xx, xxx) + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx) - return output + output[i] = calc_skew(minp, nobs, x, xx, xxx) + return output # ---------------------------------------------------------------------- # Rolling kurtosis @@ -1005,69 +746,73 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, xxxx[0] = xxxx[0] - val * val * val * val -def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_kurt_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, N + int64_t nobs = 0, i, j, N = len(values) int64_t s, e - bint is_variable - int64_t[:] start, end ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - if is_variable: + with nogil: - with nogil: + for i in range(minp - 1): + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN - for i in range(0, N): + for i in range(minp - 1, N): + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - s = start[i] - e = end[i] + if i > win - 1: + prev = values[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) - # Over the first window, observations can only be added - # never removed - if i == 0: + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) - for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + return output - else: - # After the first window, observations can both be added - # and removed +def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, s, e, N = len(values) + ndarray[float64_t] output - # calculate adds - for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + output = np.empty(N, dtype=float) - # calculate deletes - for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + with nogil: - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + for i in range(0, N): - else: + s = start[i] + e = end[i] - with nogil: + # Over the first window, observations can only be added + # never removed + if i == 0: - 
for i in range(minp - 1): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - output[i] = NaN + for j in range(s, e): + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) - for i in range(minp - 1, N): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) + else: - if i > win - 1: - prev = values[i - win] - remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + # After the first window, observations can both be added + # and removed - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + # calculate adds + for j in range(end[i - 1], e): + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) return output @@ -1076,31 +821,26 @@ def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, # Rolling median, min, max -def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, res, prev - bint err = 0, is_variable + bint err = 0 int ret = 0 skiplist_t *sl Py_ssize_t i, j - int64_t nobs = 0, N, s, e + int64_t nobs = 0, N = len(values), s, e int midpoint - int64_t[:] start, end ndarray[float64_t] output # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs - start, end, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed, - use_mock=False) output = np.empty(N, dtype=float) - if win == 0: + if win == 0 or (end - start).max() == 0: output[:] = NaN return output - + win = (end - start).max() sl = skiplist_init(win) if sl == NULL: raise MemoryError("skiplist_init failed") @@ -1209,76 +949,89 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed): +def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - values: numpy array - window: int, size of rolling window - minp: if number of observations in window + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window is below this, output a NaN - index: ndarray, optional + index : ndarray, optional index for window computation - closed: 'right', 'left', 'both', 'neither' + closed : 'right', 'left', 'both', 'neither' make the interval closed on the right, left, both or neither endpoints """ - return _roll_min_max(values, win, minp, index, closed=closed, is_max=1) + return _roll_min_max_fixed(values, start, end, minp, win, is_max=1) -def roll_min(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed): +def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
Parameters ---------- - values: numpy array - window: int, size of rolling window - minp: if number of observations in window + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window is below this, output a NaN - index: ndarray, optional + index : ndarray, optional index for window computation + closed : 'right', 'left', 'both', 'neither' + make the interval closed on the right, left, + both or neither endpoints """ - return _roll_min_max(values, win, minp, index, is_max=0, closed=closed) + return _roll_min_max_variable(values, start, end, minp, is_max=1) -cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed, bint is_max): +def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): """ - Moving min/max of 1d array of any numeric type along axis=0 - ignoring NaNs. + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + + Parameters + ---------- + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window + is below this, output a NaN + index : ndarray, optional + index for window computation """ - cdef: - ndarray[int64_t] starti, endi - int64_t N - bint is_variable + return _roll_min_max_fixed(values, start, end, minp, win, is_max=0) - starti, endi, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed) - if is_variable: - return _roll_min_max_variable(values, starti, endi, N, win, minp, - is_max) - else: - return _roll_min_max_fixed(values, N, win, minp, is_max) +def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + """ + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
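# Illustrative sketch (not part of the diff): the monotonic-deque idea behind
# _roll_min_max_variable, shown for max only and without the NaN bookkeeping
# of the real kernel.  rolling_max is a hypothetical helper, not pandas API.
from collections import deque
import numpy as np

def rolling_max(values, start, end, minp):
    out = np.empty(len(values))
    q = deque()      # candidate indices, values decreasing; front = current max
    admitted = 0     # how many observations have been pushed so far
    for i, (s, e) in enumerate(zip(start, end)):
        for j in range(admitted, e):            # admit newly visible values
            while q and values[j] >= values[q[-1]]:
                q.pop()                         # dominated candidates can never win
            q.append(j)
        admitted = e
        while q and q[0] < s:                   # evict indices that left the window
            q.popleft()
        out[i] = values[q[0]] if q and (e - s) >= minp else np.nan
    return out

# rolling_max(np.array([3., 1., 4., 1., 5.]),
#             np.array([0, 0, 1, 2, 3]), np.array([1, 2, 3, 4, 5]), minp=1)
# -> [3., 3., 4., 4., 5.]   (a window-2 rolling max)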
+ + Parameters + ---------- + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window + is below this, output a NaN + index : ndarray, optional + index for window computation + """ + return _roll_min_max_variable(values, start, end, minp, is_max=0) cdef _roll_min_max_variable(ndarray[numeric] values, ndarray[int64_t] starti, ndarray[int64_t] endi, - int64_t N, - int64_t win, int64_t minp, bint is_max): cdef: numeric ai int64_t i, close_offset, curr_win_size - Py_ssize_t nobs = 0 + Py_ssize_t nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute ndarray[float64_t, ndim=1] output @@ -1353,15 +1106,16 @@ cdef _roll_min_max_variable(ndarray[numeric] values, cdef _roll_min_max_fixed(ndarray[numeric] values, - int64_t N, - int64_t win, + ndarray[int64_t] starti, + ndarray[int64_t] endi, int64_t minp, + int64_t win, bint is_max): cdef: numeric ai bint should_replace int64_t i, removed, window_i, - Py_ssize_t nobs = 0 + Py_ssize_t nobs = 0, N = len(values) int64_t* death numeric* ring numeric* minvalue @@ -1457,8 +1211,8 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, - int64_t minp, object index, object closed, +def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win, float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list @@ -1466,10 +1220,8 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, cdef: float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist - int64_t nobs = 0, i, j, s, e, N + int64_t nobs = 0, i, j, s, e, N = len(values) Py_ssize_t idx - bint is_variable - int64_t[:] start, end ndarray[float64_t] output float64_t vlow, vhigh InterpolationType interpolation_type @@ -1485,16 +1237,12 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs - start, end, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed, - use_mock=False) output = np.empty(N, dtype=float) - if win == 0: + if win == 0 or (end - start).max() == 0: output[:] = NaN return output - + win = (end - start).max() skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") @@ -1575,18 +1323,17 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, return output -def roll_generic(object obj, - int64_t win, int64_t minp, object index, object closed, - int offset, object func, bint raw, - object args, object kwargs): +def roll_generic_fixed(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, int64_t win, + int offset, object func, bint raw, + object args, object kwargs): cdef: ndarray[float64_t] output, counts, bufarr ndarray[float64_t, cast=True] arr float64_t *buf float64_t *oldbuf - int64_t nobs = 0, i, j, s, e, N - bint is_variable - int64_t[:] start, end + int64_t nobs = 0, i, j, s, e, N = len(start) n = len(obj) if n == 0: @@ -1599,36 +1346,13 @@ def roll_generic(object obj, if not arr.flags.c_contiguous: arr = arr.copy('C') - counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float), - np.array([0.] 
* offset)]), - win, minp, index, closed)[offset:] - - start, end, N, win, minp, is_variable = get_window_indexer(arr, win, - minp, index, - closed, - floor=0) + counts = roll_sum_fixed(np.concatenate([np.isfinite(arr).astype(float), + np.array([0.] * offset)]), + start, end, minp, win)[offset:] output = np.empty(N, dtype=float) - if is_variable: - # variable window arr or series - - if offset != 0: - raise ValueError("unable to roll_generic with a non-zero offset") - - for i in range(0, N): - s = start[i] - e = end[i] - - if counts[i] >= minp: - if raw: - output[i] = func(arr[s:e], *args, **kwargs) - else: - output[i] = func(obj.iloc[s:e], *args, **kwargs) - else: - output[i] = NaN - - elif not raw: + if not raw: # series for i in range(N): if counts[i] >= minp: @@ -1672,6 +1396,53 @@ def roll_generic(object obj, return output +def roll_generic_variable(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, + int offset, object func, bint raw, + object args, object kwargs): + cdef: + ndarray[float64_t] output, counts, bufarr + ndarray[float64_t, cast=True] arr + float64_t *buf + float64_t *oldbuf + int64_t nobs = 0, i, j, s, e, N = len(start) + + n = len(obj) + if n == 0: + return obj + + arr = np.asarray(obj) + + # ndarray input + if raw: + if not arr.flags.c_contiguous: + arr = arr.copy('C') + + counts = roll_sum_variable(np.concatenate([np.isfinite(arr).astype(float), + np.array([0.] * offset)]), + start, end, minp)[offset:] + + output = np.empty(N, dtype=float) + + if offset != 0: + raise ValueError("unable to roll_generic with a non-zero offset") + + for i in range(0, N): + s = start[i] + e = end[i] + + if counts[i] >= minp: + if raw: + output[i] = func(arr[s:e], *args, **kwargs) + else: + output[i] = func(obj.iloc[s:e], *args, **kwargs) + else: + output[i] = NaN + + return output + + # ---------------------------------------------------------------------- # Rolling sum and mean for weighted window @@ -1914,7 +1685,7 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, values: float64_t[:] values to roll window over weights: float64_t[:] - array of weights whose lenght is window size + array of weights whose length is window size minp: int64_t minimum number of observations to calculate variance of a window diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx new file mode 100644 index 0000000000000..eab9f0f8aab43 --- /dev/null +++ b/pandas/_libs/window/indexers.pyx @@ -0,0 +1,140 @@ +# cython: boundscheck=False, wraparound=False, cdivision=True + +from typing import Tuple + +import numpy as np +from numpy cimport ndarray, int64_t + +# ---------------------------------------------------------------------- +# The indexer objects for rolling +# These define start/end indexers to compute offsets + + +class FixedWindowIndexer: + """ + create a fixed length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments + + Parameters + ---------- + values: ndarray + values data array + win: int64_t + window size + index: object + index of the values + closed: string + closed behavior + """ + def __init__(self, ndarray values, int64_t win, object closed, object index=None): + cdef: + ndarray[int64_t, ndim=1] start_s, start_e, end_s, end_e + int64_t N = len(values) + + start_s = np.zeros(win, dtype='int64') + start_e = np.arange(win, N, dtype='int64') - win + 1 + self.start = np.concatenate([start_s, start_e])[:N] + + end_s = np.arange(win, dtype='int64') 
+ 1 + end_e = start_e + win + self.end = np.concatenate([end_s, end_e])[:N] + + def get_window_bounds(self) -> Tuple[np.ndarray, np.ndarray]: + return self.start, self.end + + +class VariableWindowIndexer: + """ + create a variable length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments + + Parameters + ---------- + values: ndarray + values data array + win: int64_t + window size + index: ndarray + index of the values + closed: string + closed behavior + """ + def __init__(self, ndarray values, int64_t win, object closed, ndarray index): + cdef: + bint left_closed = False + bint right_closed = False + int64_t N = len(index) + + # if windows is variable, default is 'right', otherwise default is 'both' + if closed is None: + closed = 'right' if index is not None else 'both' + + if closed in ['right', 'both']: + right_closed = True + + if closed in ['left', 'both']: + left_closed = True + + self.start, self.end = self.build(index, win, left_closed, right_closed, N) + + @staticmethod + def build(const int64_t[:] index, int64_t win, bint left_closed, + bint right_closed, int64_t N) -> Tuple[np.ndarray, np.ndarray]: + + cdef: + ndarray[int64_t] start, end + int64_t start_bound, end_bound + Py_ssize_t i, j + + start = np.empty(N, dtype='int64') + start.fill(-1) + end = np.empty(N, dtype='int64') + end.fill(-1) + + start[0] = 0 + + # right endpoint is closed + if right_closed: + end[0] = 1 + # right endpoint is open + else: + end[0] = 0 + + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, N): + end_bound = index[i] + start_bound = index[i] - win + + # left endpoint is closed + if left_closed: + start_bound -= 1 + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + # right endpoint is open + if not right_closed: + end[i] -= 1 + return start, end + + def get_window_bounds(self) -> Tuple[np.ndarray, np.ndarray]: + return self.start, self.end diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 1775893b9f2bf..73201e75c3c88 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -70,7 +70,7 @@ def write_csv_rows(list data, ndarray data_index, @cython.boundscheck(False) @cython.wraparound(False) -def convert_json_to_lines(object arr): +def convert_json_to_lines(arr: object) -> str: """ replace comma separated json with line feeds, paying special attention to quotes & brackets diff --git a/pandas/_version.py b/pandas/_version.py index 0cdedf3da3ea7..dfed9574c7cb0 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -47,7 +47,7 @@ class NotThisMethod(Exception): pass -HANDLERS = {} # type: Dict[str, Dict[str, Callable]] +HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 9870b5bed076d..61832a8b6d621 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. 
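# Illustrative sketch (not part of the diff): the start/end offsets produced
# by the FixedWindowIndexer added in pandas/_libs/window/indexers.pyx above,
# written as a standalone NumPy function (the function name is the editor's).
import numpy as np

def fixed_window_bounds(num_values, win):
    start_s = np.zeros(win, dtype="int64")
    start_e = np.arange(win, num_values, dtype="int64") - win + 1
    start = np.concatenate([start_s, start_e])[:num_values]

    end_s = np.arange(win, dtype="int64") + 1
    end_e = start_e + win
    end = np.concatenate([end_s, end_e])[:num_values]
    return start, end

# fixed_window_bounds(6, 3) -> (array([0, 0, 0, 1, 2, 3]),
#                               array([1, 2, 3, 4, 5, 6]))
# i.e. window i is values[start[i]:end[i]]; these arrays are what the
# roll_*_fixed / roll_*_variable kernels now take instead of (win, index, closed).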
""" from pandas.core.arrays import ( + BooleanArray, Categorical, DatetimeArray, IntegerArray, @@ -16,6 +17,7 @@ ) __all__ = [ + "BooleanArray", "Categorical", "DatetimeArray", "IntegerArray", diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 890d0aca0019d..f95dd8679308f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,7 +12,6 @@ import sys import warnings -PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" @@ -31,7 +30,7 @@ def set_function_name(f, name, cls): Bind the name/qualname attributes of the function. """ f.__name__ = name - f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name) + f.__qualname__ = f"{cls.__name__}.{name}" f.__module__ = cls.__module__ return f diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index cd4e1b7e8aa4d..bfe31c6a1d794 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -8,7 +8,7 @@ VERSIONS = { "bs4": "4.6.0", "bottleneck": "1.2.1", - "fastparquet": "0.2.1", + "fastparquet": "0.3.2", "gcsfs": "0.2.2", "lxml.etree": "3.8.0", "matplotlib": "2.2.2", @@ -18,7 +18,8 @@ "pandas_gbq": "0.8.0", "pyarrow": "0.9.0", "pytables": "3.4.2", - "s3fs": "0.0.8", + "pytest": "5.0.1", + "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", "tables": "3.4.2", @@ -28,15 +29,6 @@ "xlsxwriter": "0.9.8", } -message = ( - "Missing optional dependency '{name}'. {extra} " - "Use pip or conda to install {name}." -) -version_message = ( - "Pandas requires version '{minimum_version}' or newer of '{name}' " - "(version '{actual_version}' currently installed)." -) - def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -45,7 +37,7 @@ def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__VERSION__", None) if version is None: - raise ImportError("Can't determine version for {}".format(module.__name__)) + raise ImportError(f"Can't determine version for {module.__name__}") return version @@ -86,11 +78,15 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + msg = ( + f"Missing optional dependency '{name}'. {extra} " + f"Use pip or conda to install {name}." + ) try: module = importlib.import_module(name) except ImportError: if raise_on_missing: - raise ImportError(message.format(name=name, extra=extra)) from None + raise ImportError(msg) from None else: return None @@ -99,8 +95,9 @@ def import_optional_dependency( version = _get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} - msg = version_message.format( - minimum_version=minimum_version, name=name, actual_version=version + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." 
) if on_version == "warn": warnings.warn(msg, UserWarning) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 402ed62f2df65..27f1c32058941 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -18,11 +18,11 @@ if _nlv < "1.13.3": raise ImportError( - "this version of pandas is incompatible with " - "numpy < 1.13.3\n" - "your numpy version is {0}.\n" - "Please upgrade numpy to >= 1.13.3 to use " - "this pandas version".format(_np_version) + f"this version of pandas is incompatible with " + f"numpy < 1.13.3\n" + f"your numpy version is {_np_version}.\n" + f"Please upgrade numpy to >= 1.13.3 to use " + f"this pandas version" ) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c2fe7d1dd12f4..fffe09a74571e 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -58,9 +58,7 @@ def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=No fname, args, kwargs, max_fname_arg_count, self.defaults ) else: - raise ValueError( - "invalid validation method '{method}'".format(method=method) - ) + raise ValueError(f"invalid validation method '{method}'") ARGMINMAX_DEFAULTS = dict(out=None) @@ -108,7 +106,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] +ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None @@ -124,7 +122,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND = OrderedDict() # type: OrderedDict[str, Optional[int]] +ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -171,14 +169,14 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] +COMPRESS_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() COMPRESS_DEFAULTS["axis"] = None COMPRESS_DEFAULTS["out"] = None validate_compress = CompatValidator( COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 ) -CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] +CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -204,7 +202,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] +ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() ALLANY_DEFAULTS["dtype"] = None ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -226,28 +224,28 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str] +RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 ) -REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any] +REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", 
method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any] +ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] +SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]] +STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = OrderedDict() STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -275,13 +273,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] +STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]] +TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") @@ -312,9 +310,8 @@ def validate_take_with_convert(convert, args, kwargs): def validate_window_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .{func}() directly instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .{name}() directly instead " ) if len(args) > 0: @@ -328,9 +325,8 @@ def validate_window_func(name, args, kwargs): def validate_rolling_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .rolling(...).{func}() instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .rolling(...).{name}() instead " ) if len(args) > 0: @@ -344,9 +340,8 @@ def validate_rolling_func(name, args, kwargs): def validate_expanding_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .expanding(...).{func}() instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .expanding(...).{name}() instead " ) if len(args) > 0: @@ -371,11 +366,9 @@ def validate_groupby_func(name, args, kwargs, allowed=None): if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall( - ( - "numpy operations are not valid " - "with groupby. Use .groupby(...)." - "{func}() instead".format(func=name) - ) + f"numpy operations are not valid with " + f"groupby. Use .groupby(...).{name}() " + f"instead" ) @@ -391,11 +384,9 @@ def validate_resampler_func(method, args, kwargs): if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: raise UnsupportedFunctionCall( - ( - "numpy operations are not valid " - "with resample. Use .resample(...)." - "{func}() instead".format(func=method) - ) + f"numpy operations are not " + f"valid with resample. 
Use " + f".resample(...).{method}() instead" ) else: raise TypeError("too many arguments passed in") @@ -418,7 +409,4 @@ def validate_minmax_axis(axis): if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): - raise ValueError( - "`axis` must be fewer than the number of " - "dimensions ({ndim})".format(ndim=ndim) - ) + raise ValueError(f"`axis` must be fewer than the number of dimensions ({ndim})") diff --git a/pandas/conftest.py b/pandas/conftest.py index b032e14d8f7e1..78e5b5e12b7e9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -293,6 +293,20 @@ def compare_operators_no_eq_ne(request): return request.param +@pytest.fixture( + params=["__and__", "__rand__", "__or__", "__ror__", "__xor__", "__rxor__"] +) +def all_logical_operators(request): + """ + Fixture for dunder names for common logical operations + + * | + * & + * ^ + """ + return request.param + + @pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) def compression(request): """ diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index fc60c01d7b808..182b07d57ea49 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -11,8 +11,8 @@ class DirNamesMixin: - _accessors = set() # type: Set[str] - _deprecations = frozenset() # type: FrozenSet[str] + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset() def _dir_deletions(self): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 23675752a4593..18adb12a9ad72 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,7 @@ from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT -from pandas.util._decorators import Appender, Substitution, deprecate_kwarg +from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -50,7 +50,7 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices -_shared_docs = {} # type: Dict[str, str] +_shared_docs: Dict[str, str] = {} # --------------- # @@ -109,7 +109,7 @@ def _ensure_data(values, dtype=None): except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype - # and it is incompat this will fall thru to here + # and it is incompat this will fall through to here return ensure_object(values), "object" # datetimelike @@ -448,9 +448,11 @@ def isin(comps, values) -> np.ndarray: return f(comps, values) -def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None): +def _factorize_array( + values, na_sentinel: int = -1, size_hint=None, na_value=None +) -> Tuple[np.ndarray, np.ndarray]: """ - Factorize an array-like to labels and uniques. + Factorize an array-like to codes and uniques. This doesn't do any coercion of types or unboxing before factorization. 
@@ -468,18 +470,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Returns ------- - labels : ndarray + codes : ndarray uniques : ndarray """ hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value - ) + uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) - labels = ensure_platform_int(labels) - return labels, uniques + codes = ensure_platform_int(codes) + return codes, uniques _shared_docs[ @@ -494,7 +494,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Parameters ---------- - %(values)s%(sort)s%(order)s + %(values)s%(sort)s na_sentinel : int, default -1 Value to mark "not found". %(size_hint)s\ @@ -585,14 +585,6 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non coerced to ndarrays before factorization. """ ), - order=dedent( - """\ - order : None - .. deprecated:: 0.23.0 - - This parameter has no effect and is deprecated. - """ - ), sort=dedent( """\ sort : bool, default False @@ -608,13 +600,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non ), ) @Appender(_shared_docs["factorize"]) -@deprecate_kwarg(old_arg_name="order", new_arg_name=None) def factorize( - values, - sort: bool = False, - order=None, - na_sentinel: int = -1, - size_hint: Optional[int] = None, + values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -1159,7 +1146,7 @@ def compute(self, method): n = min(n, narr) kth_val = algos.kth_smallest(arr.copy(), n - 1) - ns, = np.nonzero(arr <= kth_val) + (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] if self.keep != "all": @@ -1924,33 +1911,34 @@ def diff(arr, n: int, axis: int = 0): # this module. def safe_sort( values, - labels=None, + codes=None, na_sentinel: int = -1, assume_unique: bool = False, verify: bool = True, -): +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: """ - Sort ``values`` and reorder corresponding ``labels``. - ``values`` should be unique if ``labels`` is not None. + Sort ``values`` and reorder corresponding ``codes``. + + ``values`` should be unique if ``codes`` is not None. Safe for use with mixed types (int, str), orders ints before strs. Parameters ---------- values : list-like - Sequence; must be unique if ``labels`` is not None. - labels : list_like + Sequence; must be unique if ``codes`` is not None. + codes : list_like, optional Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 - Value in ``labels`` to mark "not found". - Ignored when ``labels`` is None. + Value in ``codes`` to mark "not found". + Ignored when ``codes`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up - the calculation. Ignored when ``labels`` is None. + the calculation. Ignored when ``codes`` is None. verify : bool, default True - Check if labels are out of bound for the values and put out of bound - labels equal to na_sentinel. If ``verify=False``, it is assumed there - are no out of bound labels. Ignored when ``labels`` is None. 
+ Check if codes are out of bound for the values and put out of bound + codes equal to na_sentinel. If ``verify=False``, it is assumed there + are no out of bound codes. Ignored when ``codes`` is None. .. versionadded:: 0.25.0 @@ -1958,17 +1946,17 @@ def safe_sort( ------- ordered : ndarray Sorted ``values`` - new_labels : ndarray - Reordered ``labels``; returned when ``labels`` is not None. + new_codes : ndarray + Reordered ``codes``; returned when ``codes`` is not None. Raises ------ TypeError - * If ``values`` is not list-like or if ``labels`` is neither None + * If ``values`` is not list-like or if ``codes`` is neither None nor list-like * If ``values`` cannot be sorted ValueError - * If ``labels`` is not None and ``values`` contain duplicates. + * If ``codes`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError( @@ -2002,22 +1990,22 @@ def sort_mixed(values): # try this anyway ordered = sort_mixed(values) - # labels: + # codes: - if labels is None: + if codes is None: return ordered - if not is_list_like(labels): + if not is_list_like(codes): raise TypeError( "Only list-like objects or None are allowed to be" - "passed to safe_sort as labels" + "passed to safe_sort as codes" ) - labels = ensure_platform_int(np.asarray(labels)) + codes = ensure_platform_int(np.asarray(codes)) from pandas import Index if not assume_unique and not Index(values).is_unique: - raise ValueError("values should be unique if labels is not None") + raise ValueError("values should be unique if codes is not None") if sorter is None: # mixed types @@ -2029,9 +2017,9 @@ def sort_mixed(values): if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() - new_labels = take_1d(order2, labels, fill_value=-1) + new_codes = take_1d(order2, codes, fill_value=-1) if verify: - mask = (labels < -len(values)) | (labels >= len(values)) + mask = (codes < -len(values)) | (codes >= len(values)) else: mask = None else: @@ -2039,13 +2027,13 @@ def sort_mixed(values): reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode="wrap") + new_codes = reverse_indexer.take(codes, mode="wrap") - mask = labels == na_sentinel + mask = codes == na_sentinel if verify: - mask = mask | (labels < -len(values)) | (labels >= len(values)) + mask = mask | (codes < -len(values)) | (codes >= len(values)) if mask is not None: - np.putmask(new_labels, mask, na_sentinel) + np.putmask(new_codes, mask, na_sentinel) - return ordered, ensure_platform_int(new_labels) + return ordered, ensure_platform_int(new_codes) diff --git a/pandas/core/api.py b/pandas/core/api.py index 04f2f84c92a15..65f0178b19187 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -10,9 +10,9 @@ ) from pandas.core.dtypes.missing import isna, isnull, notna, notnull -# TODO: Remove get_dummies import when statsmodels updates #18264 from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -45,7 +45,6 @@ from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexing import IndexSlice -from pandas.core.reshape.reshape import get_dummies from 
pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d9f6bdae288ed..8c49b2b803241 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,4 +1,6 @@ +import abc import inspect +from typing import TYPE_CHECKING, Any, Dict, Iterator, Tuple, Type, Union import numpy as np @@ -11,22 +13,28 @@ is_list_like, is_sequence, ) -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries + +if TYPE_CHECKING: + from pandas import DataFrame, Series, Index + +ResType = Dict[int, Any] def frame_apply( - obj, + obj: "DataFrame", func, axis=0, - raw=False, + raw: bool = False, result_type=None, - ignore_failures=False, + ignore_failures: bool = False, args=None, kwds=None, ): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) + klass: Type[FrameApply] if axis == 0: klass = FrameRowApply elif axis == 1: @@ -43,8 +51,45 @@ def frame_apply( ) -class FrameApply: - def __init__(self, obj, func, raw, result_type, ignore_failures, args, kwds): +class FrameApply(metaclass=abc.ABCMeta): + + # --------------------------------------------------------------- + # Abstract Methods + axis: int + + @property + @abc.abstractmethod + def result_index(self) -> "Index": + pass + + @property + @abc.abstractmethod + def result_columns(self) -> "Index": + pass + + @property + @abc.abstractmethod + def series_generator(self) -> Iterator["Series"]: + pass + + @abc.abstractmethod + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: + pass + + # --------------------------------------------------------------- + + def __init__( + self, + obj: "DataFrame", + func, + raw: bool, + result_type, + ignore_failures: bool, + args, + kwds, + ): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures @@ -70,17 +115,16 @@ def f(x): self.f = f - # results - self.result = None - self.res_index = None - self.res_columns = None + @property + def res_columns(self) -> "Index": + return self.result_columns @property - def columns(self): + def columns(self) -> "Index": return self.obj.columns @property - def index(self): + def index(self) -> "Index": return self.obj.index @cache_readonly @@ -88,11 +132,11 @@ def values(self): return self.obj.values @cache_readonly - def dtypes(self): + def dtypes(self) -> "Series": return self.obj.dtypes @property - def agg_axis(self): + def agg_axis(self) -> "Index": return self.obj._get_agg_axis(self.axis) def get_result(self): @@ -127,7 +171,7 @@ def get_result(self): # broadcasting if self.result_type == "broadcast": - return self.apply_broadcast() + return self.apply_broadcast(self.obj) # one axis empty elif not all(self.obj.shape): @@ -183,6 +227,8 @@ def apply_raw(self): if "Function does not reduce" not in str(err): # catch only ValueError raised intentionally in libreduction raise + # We expect np.apply_along_axis to give a two-dimensional result, or + # also raise. 
result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case @@ -191,7 +237,7 @@ def apply_raw(self): else: return self.obj._constructor_sliced(result, index=self.agg_axis) - def apply_broadcast(self, target): + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": result_values = np.empty_like(target.values) # axis which we want to compare compliance @@ -231,7 +277,7 @@ def apply_standard(self): and not self.dtypes.apply(is_extension_array_dtype).any() # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex - and not self.agg_axis._has_complex_internals + and not isinstance(self.agg_axis, ABCMultiIndex) ): values = self.values @@ -263,12 +309,12 @@ def apply_standard(self): return self.obj._constructor_sliced(result, index=labels) # compute the result using the series generator - self.apply_series_generator() + results, res_index = self.apply_series_generator() # wrap results - return self.wrap_results() + return self.wrap_results(results, res_index) - def apply_series_generator(self): + def apply_series_generator(self) -> Tuple[ResType, "Index"]: series_gen = self.series_generator res_index = self.result_index @@ -295,21 +341,20 @@ def apply_series_generator(self): results[i] = self.f(v) keys.append(v.name) - self.results = results - self.res_index = res_index - self.res_columns = self.result_columns + return results, res_index - def wrap_results(self): - results = self.results + def wrap_results( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: # see if we can infer the results if len(results) > 0 and 0 in results and is_sequence(results[0]): - return self.wrap_results_for_axis() + return self.wrap_results_for_axis(results, res_index) # dict of scalars result = self.obj._constructor_sliced(results) - result.index = self.res_index + result.index = res_index return result @@ -317,33 +362,34 @@ def wrap_results(self): class FrameRowApply(FrameApply): axis = 0 - def apply_broadcast(self): - return super().apply_broadcast(self.obj) + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + return super().apply_broadcast(target) @property def series_generator(self): return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @property - def result_index(self): + def result_index(self) -> "Index": return self.columns @property - def result_columns(self): + def result_columns(self) -> "Index": return self.index - def wrap_results_for_axis(self): + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> "DataFrame": """ return the results for the rows """ - results = self.results result = self.obj._constructor(data=results) if not isinstance(results[0], ABCSeries): if len(result.index) == len(self.res_columns): result.index = self.res_columns - if len(result.columns) == len(self.res_index): - result.columns = self.res_index + if len(result.columns) == len(res_index): + result.columns = res_index return result @@ -351,8 +397,8 @@ def wrap_results_for_axis(self): class FrameColumnApply(FrameApply): axis = 1 - def apply_broadcast(self): - result = super().apply_broadcast(self.obj.T) + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + result = super().apply_broadcast(target.T) return result.T @property @@ -364,43 +410,44 @@ def series_generator(self): ) @property - def result_index(self): + def result_index(self) -> "Index": return self.index @property - def result_columns(self): + def result_columns(self) -> "Index": return self.columns - def 
wrap_results_for_axis(self): + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: """ return the results for the columns """ - results = self.results + result: Union["Series", "DataFrame"] # we have requested to expand if self.result_type == "expand": - result = self.infer_to_same_shape() + result = self.infer_to_same_shape(results, res_index) # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): from pandas import Series result = Series(results) - result.index = self.res_index + result.index = res_index # we may want to infer results else: - result = self.infer_to_same_shape() + result = self.infer_to_same_shape(results, res_index) return result - def infer_to_same_shape(self): + def infer_to_same_shape(self, results: ResType, res_index: "Index") -> "DataFrame": """ infer the results to the same shape as the input object """ - results = self.results result = self.obj._constructor(data=results) result = result.T # set the index - result.index = self.res_index + result.index = res_index # infer dtypes result = result.infer_objects() diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 03d998707c26b..df26cd94b5ed9 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -4,6 +4,7 @@ ExtensionScalarOpsMixin, try_cast_to_ea, ) +from .boolean import BooleanArray # noqa: F401 from .categorical import Categorical # noqa: F401 from .datetimes import DatetimeArray # noqa: F401 from .integer import IntegerArray, integer_array # noqa: F401 diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 82dabe735581b..a444a4e46d0d7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -29,7 +29,7 @@ _not_implemented_message = "{} does not implement {}." 
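Note on the pandas/core/apply.py hunks above: FrameApply becomes an abstract base class, and (results, res_index) now flow through apply_series_generator/wrap_results as return values instead of instance state; frame_apply still routes axis=0 to FrameRowApply and axis=1 to FrameColumnApply. A minimal sketch of the public behaviour this machinery backs, using only the stable DataFrame.apply API (no internal names beyond those shown in the diff are assumed):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

# axis=0: the function sees each column once -> routed to FrameRowApply
col_sums = df.apply(np.sum, axis=0)      # a -> 6, b -> 60

# axis=1: the function sees each row once -> routed to FrameColumnApply
row_sums = df.apply(np.sum, axis=1)      # 11, 22, 33

# result_type="broadcast" keeps the input shape, exercising the
# apply_broadcast(target) path that now takes an explicit target argument
same_shape = df.apply(lambda row: [0, 0], axis=1, result_type="broadcast")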
-_extension_array_shared_docs = dict() # type: Dict[str, str] +_extension_array_shared_docs: Dict[str, str] = dict() def try_cast_to_ea(cls_or_instance, obj, dtype=None): @@ -1088,6 +1088,15 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + @classmethod + def _add_logical_ops(cls): + cls.__and__ = cls._create_logical_method(operator.and_) + cls.__rand__ = cls._create_logical_method(ops.rand_) + cls.__or__ = cls._create_logical_method(operator.or_) + cls.__ror__ = cls._create_logical_method(ops.ror_) + cls.__xor__ = cls._create_logical_method(operator.xor) + cls.__rxor__ = cls._create_logical_method(ops.rxor) + class ExtensionScalarOpsMixin(ExtensionOpsMixin): """ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py new file mode 100644 index 0000000000000..c118b6fe26549 --- /dev/null +++ b/pandas/core/arrays/boolean.py @@ -0,0 +1,745 @@ +import numbers +from typing import TYPE_CHECKING, Type +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.compat import set_function_name + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import nanops, ops +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +@register_extension_dtype +class BooleanDtype(ExtensionDtype): + """ + Extension dtype for boolean data. + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.BooleanDtype() + BooleanDtype + """ + + @property + def na_value(self) -> "Scalar": + """ + BooleanDtype uses :attr:`numpy.nan` as the missing NA value. + + .. warning:: + + `na_value` may change in a future release. + """ + return np.nan + + @property + def type(self) -> Type: + return np.bool_ + + @property + def kind(self) -> str: + return "b" + + @property + def name(self) -> str: + """ + The alias for BooleanDtype is ``'boolean'``. + """ + return "boolean" + + @classmethod + def construct_from_string(cls, string: str) -> ExtensionDtype: + if string == "boolean": + return cls() + return super().construct_from_string(string) + + @classmethod + def construct_array_type(cls) -> "Type[BooleanArray]": + return BooleanArray + + def __repr__(self) -> str: + return "BooleanDtype" + + @property + def _is_boolean(self) -> bool: + return True + + +def coerce_to_array(values, mask=None, copy: bool = False): + """ + Coerce the input values array to numpy arrays with a mask. 
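For orientation, the masked boolean type defined in this new file is normally reached through pd.array with dtype="boolean" rather than by calling coerce_to_array directly. A small usage sketch assuming only the constructors documented in this file:

import pandas as pd

# the documented entry point: None becomes a masked (missing) entry
arr = pd.array([True, False, None], dtype="boolean")
print(arr)          # [True, False, NaN], dtype: boolean
print(arr.isna())   # [False False  True]

# a Series can hold the new dtype directly
s = pd.Series([True, None, False], dtype="boolean")
print(s.dtype)      # boolean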
+ + Parameters + ---------- + values : 1D list-like + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + if isinstance(values, BooleanArray): + if mask is not None: + raise ValueError("cannot pass mask for BooleanArray input") + values, mask = values._data, values._mask + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + mask_values = None + if isinstance(values, np.ndarray) and values.dtype == np.bool_: + if copy: + values = values.copy() + else: + # TODO conversion from integer/float ndarray can be done more efficiently + # (avoid roundtrip through object) + values_object = np.asarray(values, dtype=object) + + inferred_dtype = lib.infer_dtype(values_object, skipna=True) + integer_like = ("floating", "integer", "mixed-integer-float") + if inferred_dtype not in ("boolean", "empty") + integer_like: + raise TypeError("Need to pass bool-like values") + + mask_values = isna(values_object) + values = np.zeros(len(values), dtype=bool) + values[~mask_values] = values_object[~mask_values].astype(bool) + + # if the values were integer-like, validate it were actually 0/1's + if inferred_dtype in integer_like: + if not np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ): + raise TypeError("Need to pass bool-like values") + + if mask is None and mask_values is None: + mask = np.zeros(len(values), dtype=bool) + elif mask is None: + mask = mask_values + else: + if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: + if mask_values is not None: + mask = mask | mask_values + else: + if copy: + mask = mask.copy() + else: + mask = np.array(mask, dtype=bool) + if mask_values is not None: + mask = mask | mask_values + + if not values.ndim == 1: + raise ValueError("values must be a 1D list-like") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D list-like") + + return values, mask + + +class BooleanArray(ExtensionArray, ExtensionOpsMixin): + """ + Array of boolean (True/False) data with missing values. + + This is a pandas Extension array for boolean data, under the hood + represented by 2 numpy arrays: a boolean array with the data and + a boolean array with the mask (True indicating missing). + + To construct an BooleanArray from generic array-like input, use + :func:`pandas.array` specifying ``dtype="boolean"`` (see examples + below). + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : numpy.ndarray + A 1-d boolean-dtype array with the data. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values (True + indicates missing). + copy : bool, default False + Whether to copy the `values` and `mask` arrays. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + BooleanArray + + Examples + -------- + Create an BooleanArray with :func:`pandas.array`: + + >>> pd.array([True, False, None], dtype="boolean") + + [True, False, NaN] + Length: 3, dtype: boolean + """ + + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): + raise TypeError( + "values should be boolean numpy array. Use " + "the 'array' function instead" + ) + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. 
Use " + "the 'array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask + self._dtype = BooleanDtype() + + @property + def dtype(self): + return self._dtype + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy: bool = False): + if dtype: + assert dtype == "boolean" + values, mask = coerce_to_array(scalars, copy=copy) + return BooleanArray(values, mask) + + @classmethod + def _from_factorized(cls, values, original: "BooleanArray"): + return cls._from_sequence(values, dtype=original.dtype) + + def _formatter(self, boxed=False): + def fmt(x): + if isna(x): + return "NaN" + return str(x) + + return fmt + + def __getitem__(self, item): + if is_integer(item): + if self._mask[item]: + return self.dtype.na_value + return self._data[item] + return type(self)(self._data[item], self._mask[item]) + + def _coerce_to_ndarray(self, force_bool: bool = False): + """ + Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). + + Parameters + ---------- + force_bool : bool, default False + If True, return bool array or raise error if not possible (in + presence of missing values) + """ + if force_bool: + if not self.isna().any(): + return self._data + else: + raise ValueError( + "cannot convert to bool numpy array in presence of missing values" + ) + data = self._data.astype(object) + data[self._mask] = self._na_value + return data + + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + + def __array__(self, dtype=None): + """ + the array interface, return my values + We return an object array here to preserve our scalar values + """ + if dtype is not None: + if is_bool_dtype(dtype): + return self._coerce_to_ndarray(force_bool=True) + # TODO can optimize this to not go through object dtype for + # numeric dtypes + arr = self._coerce_to_ndarray() + return arr.astype(dtype, copy=False) + # by default (no dtype specified), return an object array + return self._coerce_to_ndarray() + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # For BooleanArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. + raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (BooleanArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, BooleanArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. 
+ + if is_bool_dtype(x.dtype): + m = mask.copy() + return BooleanArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + + def __iter__(self): + for i in range(len(self)): + if self._mask[i]: + yield self.dtype.na_value + else: + yield self._data[i] + + def take(self, indexer, allow_fill=False, fill_value=None): + # we always fill with False internally + # to avoid upcasting + data_fill_value = False if isna(fill_value) else fill_value + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) + + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + + # if we are filling + # we only fill where the indexer is null + # not existing missing values + # TODO(jreback) what if we have a non-na float as a fill value? + if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return type(self)(result, mask, copy=False) + + def copy(self): + data, mask = self._data, self._mask + data = data.copy() + mask = mask.copy() + return type(self)(data, mask, copy=False) + + def __setitem__(self, key, value): + _is_scalar = is_scalar(value) + if _is_scalar: + value = [value] + value, mask = coerce_to_array(value) + + if _is_scalar: + value = value[0] + mask = mask[0] + + self._data[key] = value + self._mask[key] = mask + + def __len__(self): + return len(self._data) + + @property + def nbytes(self): + return self._data.nbytes + self._mask.nbytes + + def isna(self): + return self._mask + + @property + def _na_value(self): + return self._dtype.na_value + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x._data for x in to_concat]) + mask = np.concatenate([x._mask for x in to_concat]) + return cls(data, mask) + + def astype(self, dtype, copy=True): + """ + Cast to a NumPy array or ExtensionArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray or ExtensionArray + NumPy ndarray, BooleanArray or IntergerArray with 'dtype' for its dtype. + + Raises + ------ + TypeError + if incompatible type with an BooleanDtype, equivalent of same_kind + casting + """ + dtype = pandas_dtype(dtype) + + if isinstance(dtype, BooleanDtype): + values, mask = coerce_to_array(self, copy=copy) + return BooleanArray(values, mask, copy=False) + + if is_bool_dtype(dtype): + # astype_nansafe converts np.nan to True + if self.isna().any(): + raise ValueError("cannot convert float NaN to bool") + else: + return self._data.astype(dtype, copy=copy) + if is_extension_array_dtype(dtype) and is_integer_dtype(dtype): + from pandas.core.arrays import IntegerArray + + return IntegerArray( + self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False + ) + # coerce + data = self._coerce_to_ndarray() + return astype_nansafe(data, dtype, copy=None) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each category. + + Every category will have an entry, even those with a count of 0. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN. 
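A short, hedged usage sketch for the astype and value_counts paths defined above, assuming only the behaviour documented in this file (casting to plain bool refuses when missing values are present; value_counts counts the masked entries only with dropna=False):

import pandas as pd

arr = pd.array([True, True, False, None], dtype="boolean")

# the missing entry is dropped by default and counted with dropna=False
print(arr.value_counts())
print(arr.value_counts(dropna=False))

# casting to plain numpy bool only works when no values are missing
print(arr[:3].astype(bool))        # array([ True,  True, False])
try:
    arr.astype(bool)               # raises: cannot convert float NaN to bool
except ValueError as err:
    print(err)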
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + + """ + + from pandas import Index, Series + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + array = value_counts.values + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(bool).astype(object) + + # if we want nans, count the mask + if not dropna: + + # TODO(extension) + # appending to an Index *always* infers + # w/o passing the dtype + array = np.append(array, [self._mask.sum()]) + index = Index( + np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object + ) + + return Series(array, index=index) + + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self._data.copy() + data[self._mask] = -1 + return data + + @classmethod + def _create_logical_method(cls, op): + def logical_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other, mask = coerce_to_array(other, copy=False) + + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + return BooleanArray(result, mask) + + name = "__{name}__".format(name=op.__name__) + return set_function_name(logical_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + + def cmp_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + result[mask] = op_name == "ne" + return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False) + + name = "__{name}__".format(name=op.__name__) + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype("float64") + data[mask] = self._na_value + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + # if we have a boolean op, don't coerce + if name in ["any", "all"]: + pass + + # if we have numeric op that would result in an int, coerce to int if possible + elif name in ["sum", "prod"] and notna(result): + int_result = np.int64(result) + if int_result == result: + result = int_result + + elif name in ["min", "max"] and notna(result): + result = np.bool_(result) + + return result + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv"] + ): + result[mask] = np.nan + return result + + if is_bool_dtype(result): + return BooleanArray(result, mask, copy=False) + + elif is_integer_dtype(result): + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask, copy=False) + else: + result[mask] = np.nan + return result + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + def boolean_arithmetic_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = "__{name}__".format(name=op_name) + return set_function_name(boolean_arithmetic_method, name, cls) + + +BooleanArray._add_logical_ops() +BooleanArray._add_comparison_ops() +BooleanArray._add_arithmetic_ops() diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 39470c7420086..ca9ec2fd63165 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -8,7 +8,7 @@ from pandas._config import get_option -from pandas._libs import algos as libalgos, hashtable as htable, lib +from pandas._libs import algos as libalgos, hashtable as htable from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -39,7 +39,7 @@ needs_i8_conversion, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -52,6 +52,7 @@ import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array from pandas.core.missing import interpolate_2d +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.io.formats import console @@ -72,18 +73,10 @@ def _cat_compare_op(op): - opname = "__{op}__".format(op=op.__name__) - - def f(self, other): - # On python2, you can usually compare any type to any type, and - # Categoricals can be seen as a custom type, but having different - # results depending whether categories are the same or not is kind of - # insane, so be a bit stricter here and use the python3 idea of - # comparing only things of equal type. - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - other = lib.item_from_zerodim(other) + opname = f"__{op.__name__}__" + + @unpack_zerodim_and_defer(opname) + def func(self, other): if is_list_like(other) and len(other) != len(self): # TODO: Could this fail if the categories are listlike objects? raise ValueError("Lengths must match.") @@ -134,15 +127,14 @@ def f(self, other): return ret else: if opname == "__eq__": - return np.repeat(False, len(self)) + return np.zeros(len(self), dtype=bool) elif opname == "__ne__": - return np.repeat(True, len(self)) + return np.ones(len(self), dtype=bool) else: - msg = ( - "Cannot compare a Categorical for op {op} with a " + raise TypeError( + f"Cannot compare a Categorical for op {opname} with a " "scalar, which is not a category." 
) - raise TypeError(msg.format(op=opname)) else: # allow categorical vs object dtype array comparisons for equality @@ -150,16 +142,15 @@ def f(self, other): if opname in ["__eq__", "__ne__"]: return getattr(np.array(self), opname)(np.array(other)) - msg = ( - "Cannot compare a Categorical for op {op} with type {typ}." - "\nIf you want to compare values, use 'np.asarray(cat) " - " other'." + raise TypeError( + f"Cannot compare a Categorical for op {opname} with " + f"type {type(other)}.\nIf you want to compare values, " + "use 'np.asarray(cat) other'." ) - raise TypeError(msg.format(op=opname, typ=type(other))) - f.__name__ = opname + func.__name__ = opname - return f + return func def contains(cat, key, container): @@ -843,8 +834,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal On the other hand this methods does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string - dtypes on python3, which does not considers a S1 string equal to a - single char python string. + dtypes, which does not considers a S1 string equal to a single char + python string. Parameters ---------- @@ -1061,11 +1052,9 @@ def add_categories(self, new_categories, inplace=False): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: - msg = ( - "new categories must not include old categories: " - "{already_included!s}" + raise ValueError( + f"new categories must not include old categories: {already_included}" ) - raise ValueError(msg.format(already_included=already_included)) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) @@ -1111,7 +1100,7 @@ def remove_categories(self, removals, inplace=False): if not is_list_like(removals): removals = [removals] - removal_set = set(list(removals)) + removal_set = set(removals) not_included = removal_set - set(self.dtype.categories) new_categories = [c for c in self.dtype.categories if c not in removal_set] @@ -1121,8 +1110,7 @@ def remove_categories(self, removals, inplace=False): new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: - msg = "removals must all be in old categories: {not_included!s}" - raise ValueError(msg.format(not_included=not_included)) + raise ValueError(f"removals must all be in old categories: {not_included}") return self.set_categories( new_categories, ordered=self.ordered, rename=False, inplace=inplace @@ -1300,9 +1288,8 @@ def shift(self, periods, fill_value=None): fill_value = self.categories.get_loc(fill_value) else: raise ValueError( - "'fill_value={}' is not present " - "in this Categorical's " - "categories".format(fill_value) + f"'fill_value={fill_value}' is not present " + "in this Categorical's categories" ) if periods > 0: codes[:periods] = fill_value @@ -1343,8 +1330,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) raise TypeError( - "Object with dtype {dtype} cannot perform " - "the numpy op {op}".format(dtype=self.dtype, op=ufunc.__name__) + f"Object with dtype {self.dtype} cannot perform " + f"the numpy op {ufunc.__name__}" ) def __setstate__(self, state): @@ -1543,9 +1530,9 @@ def check_for_ordered(self, op): """ assert that we are ordered """ if not self.ordered: raise TypeError( - 
"Categorical is not ordered for operation {op}\n" + f"Categorical is not ordered for operation {op}\n" "you can use .as_ordered() to change the " - "Categorical to an ordered one\n".format(op=op) + "Categorical to an ordered one\n" ) def _values_for_argsort(self): @@ -1680,8 +1667,7 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"): """ inplace = validate_bool_kwarg(inplace, "inplace") if na_position not in ["last", "first"]: - msg = "invalid na_position: {na_position!r}" - raise ValueError(msg.format(na_position=na_position)) + raise ValueError(f"invalid na_position: {na_position!r}") sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) @@ -1837,8 +1823,7 @@ def fillna(self, value=None, method=None, limit=None): else: raise TypeError( '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - '"{0}"'.format(type(value).__name__) + f'or Series, but you passed a {type(value).__name__!r}"' ) return self._constructor(codes, dtype=self.dtype, fastpath=True) @@ -1931,8 +1916,11 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - msg = "'fill_value' ('{}') is not in this Categorical's categories." - raise TypeError(msg.format(fill_value)) + msg = ( + f"'fill_value' ('{fill_value}') is not in this " + "Categorical's categories." + ) + raise TypeError(msg) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) result = type(self).from_codes(codes, dtype=dtype) @@ -1940,7 +1928,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): take = take_nd - def __len__(self): + def __len__(self) -> int: """ The length of this Categorical. """ @@ -1970,11 +1958,9 @@ def _tidy_repr(self, max_vals=10, footer=True): head = self[:num]._get_repr(length=False, footer=False) tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - result = "{head}, ..., {tail}".format(head=head[:-1], tail=tail[1:]) + result = f"{head[:-1]}, ..., {tail[1:]}" if footer: - result = "{result}\n{footer}".format( - result=result, footer=self._repr_footer() - ) + result = f"{result}\n{self._repr_footer()}" return str(result) @@ -2008,9 +1994,7 @@ def _repr_categories_info(self): category_strs = self._repr_categories() dtype = str(self.categories.dtype) - levheader = "Categories ({length}, {dtype}): ".format( - length=len(self.categories), dtype=dtype - ) + levheader = f"Categories ({len(self.categories)}, {dtype}): " width, height = get_terminal_size() max_width = get_option("display.width") or width if console.in_ipython_frontend(): @@ -2034,10 +2018,8 @@ def _repr_categories_info(self): return levheader + "[" + levstring.replace(" < ... < ", " ... 
") + "]" def _repr_footer(self): - - return "Length: {length}\n{info}".format( - length=len(self), info=self._repr_categories_info() - ) + info = self._repr_categories_info() + return f"Length: {len(self)}\n{info}" def _get_repr(self, length=True, na_rep="NaN", footer=True): from pandas.io.formats import format as fmt @@ -2059,7 +2041,7 @@ def __repr__(self) -> str: result = self._get_repr(length=len(self) > _maxlen) else: msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = "[], {repr_msg}".format(repr_msg=msg) + result = f"[], {msg}" return result @@ -2190,8 +2172,7 @@ def _reverse_indexer(self): def _reduce(self, name, axis=0, **kwargs): func = getattr(self, name, None) if func is None: - msg = "Categorical cannot perform the operation {op}" - raise TypeError(msg.format(op=name)) + raise TypeError(f"Categorical cannot perform the operation {name}") return func(**kwargs) def min(self, numeric_only=None, **kwargs): @@ -2459,11 +2440,10 @@ def isin(self, values): array([ True, False, True, False, True, False]) """ if not is_list_like(values): + values_type = type(values).__name__ raise TypeError( "only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]".format( - values_type=type(values).__name__ - ) + f" to isin(), you passed a [{values_type}]" ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) @@ -2471,6 +2451,51 @@ def isin(self, values): code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) + def replace(self, to_replace, value, inplace: bool = False): + """ + Replaces all instances of one value with another + + Parameters + ---------- + to_replace: object + The value to be replaced + + value: object + The value to replace it with + + inplace: bool + Whether the operation is done in-place + + Returns + ------- + None if inplace is True, otherwise the new Categorical after replacement + + + Examples + -------- + >>> s = pd.Categorical([1, 2, 1, 3]) + >>> s.replace(1, 3) + [3, 3, 2, 3] + Categories (2, int64): [2, 3] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + if to_replace in cat.categories: + if isna(value): + cat.remove_categories(to_replace, inplace=True) + else: + categories = cat.categories.tolist() + index = categories.index(to_replace) + if value in cat.categories: + value_index = categories.index(value) + cat._codes[cat._codes == index] = value_index + cat.remove_categories(to_replace, inplace=True) + else: + categories[index] = value + cat.rename_categories(categories, inplace=True) + if not inplace: + return cat + # The Series.cat accessor diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4b83dd0cfff09..dc3c49b7e06a9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -33,12 +33,7 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCPeriodArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodArray, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna @@ -46,6 +41,7 @@ from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid 
import make_invalid_op from pandas.tseries import frequencies @@ -55,7 +51,7 @@ class AttributesMixin: - _data = None # type: np.ndarray + _data: np.ndarray @classmethod def _simple_new(cls, values, **kwargs): @@ -179,7 +175,8 @@ def strftime(self, date_format): 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - return self._format_native_types(date_format=date_format).astype(object) + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object) class TimelikeOps: @@ -396,7 +393,7 @@ def size(self) -> int: """The number of elements in this array.""" return np.prod(self.shape) - def __len__(self): + def __len__(self) -> int: return len(self._data) def __getitem__(self, key): @@ -1193,13 +1190,11 @@ def _time_shift(self, periods, freq=None): # to be passed explicitly. return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + @unpack_zerodim_and_defer("__add__") def __add__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented # scalar others - elif other is NaT: + if other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) @@ -1247,13 +1242,11 @@ def __radd__(self, other): # alias for __add__ return self.__add__(other) + @unpack_zerodim_and_defer("__sub__") def __sub__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented # scalar others - elif other is NaT: + if other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) @@ -1303,6 +1296,9 @@ def __rsub__(self, other): if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self.dtype): # ndarray[datetime64] cannot be subtracted from self, so # we need to wrap in DatetimeArray/Index and flip the operation + if lib.is_scalar(other): + # i.e. 
np.datetime64 object + return Timestamp(other) - self if not isinstance(other, DatetimeLikeArrayMixin): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray @@ -1339,11 +1335,11 @@ def __rsub__(self, other): return -(self - other) # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 - def __iadd__(self, other): + def __iadd__(self, other): # type: ignore # alias for __add__ return self.__add__(other) - def __isub__(self, other): + def __isub__(self, other): # type: ignore # alias for __sub__ return self.__sub__(other) @@ -1490,7 +1486,7 @@ def mean(self, skipna=True): values = self if not len(values): - # short-circut for empty max / min + # short-circuit for empty max / min return NaT result = nanops.nanmean(values.view("i8"), skipna=skipna) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7cd103d12fa8a..71420e6e58090 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -40,12 +40,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCPandasArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -53,6 +48,7 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset @@ -157,11 +153,8 @@ def _dt_array_cmp(cls, op): opname = "__{name}__".format(name=op.__name__) nat_result = opname == "__ne__" + @unpack_zerodim_and_defer(opname) def wrapper(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - other = lib.item_from_zerodim(other) if isinstance(other, (datetime, np.datetime64, str)): if isinstance(other, (datetime, np.datetime64)): @@ -327,7 +320,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps # ----------------------------------------------------------------- # Constructors - _dtype = None # type: Union[np.dtype, DatetimeTZDtype] + _dtype: Union[np.dtype, DatetimeTZDtype] _freq = None def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 08b53e54b91ef..12b76df9a5983 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,12 +21,12 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -40,9 +40,9 @@ class _IntegerDtype(ExtensionDtype): The attributes name & type are set when these subclasses are created. 
""" - name = None # type: str + name: str base = None - type = None # type: Type + type: Type na_value = np.nan def __repr__(self) -> str: @@ -85,6 +85,35 @@ def construct_array_type(cls): """ return IntegerArray + def __from_arrow__(self, array): + """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + buflist = arr.buffers() + data = np.frombuffer(buflist[1], dtype=self.type)[ + arr.offset : arr.offset + len(arr) + ] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + int_arr = IntegerArray(data.copy(), ~mask, copy=False) + results.append(int_arr) + + return IntegerArray._concat_same_type(results) + def integer_array(values, dtype=None, copy=False): """ @@ -469,7 +498,7 @@ def __setitem__(self, key, value): self._data[key] = value self._mask[key] = mask - def __len__(self): + def __len__(self) -> int: return len(self._data) @property @@ -602,13 +631,8 @@ def _values_for_argsort(self) -> np.ndarray: def _create_comparison_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): - - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) mask = None if isinstance(other, IntegerArray): @@ -652,7 +676,7 @@ def _reduce(self, name, skipna=True, **kwargs): data[mask] = self._na_value op = getattr(nanops, "nan" + name) - result = op(data, axis=0, skipna=skipna, mask=mask) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) # if we have a boolean op, don't coerce if name in ["any", "all"]: @@ -697,15 +721,14 @@ def _maybe_mask_result(self, result, mask, other, op_name): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def integer_arithmetic_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. 
- return NotImplemented - - other = lib.item_from_zerodim(other) mask = None + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if isinstance(other, IntegerArray): other, mask = other._data, other._mask diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cc41797e7872b..cb482665b3534 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -489,7 +489,7 @@ def _validate(self): def __iter__(self): return iter(np.asarray(self)) - def __len__(self): + def __len__(self) -> int: return len(self.left) def __getitem__(self, value): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 78cc54db4b1b8..41a8c48452647 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -4,7 +4,6 @@ import numpy as np -from pandas._libs import lib from pandas._libs.tslibs import ( NaT, NaTType, @@ -35,7 +34,6 @@ ) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, @@ -46,6 +44,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick @@ -69,13 +68,10 @@ def _period_array_cmp(cls, op): opname = "__{name}__".format(name=op.__name__) nat_result = opname == "__ne__" + @unpack_zerodim_and_defer(opname) def wrapper(self, other): ordinal_op = getattr(self.asi8, opname) - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - if is_list_like(other) and len(other) != len(self): raise ValueError("Lengths must match") @@ -165,7 +161,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): _scalar_type = Period # Names others delegate to us - _other_ops = [] # type: List[str] + _other_ops: List[str] = [] _bool_ops = ["is_leap_year"] _object_ops = ["start_time", "end_time", "freq"] _field_ops = [ @@ -600,7 +596,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): if date_format: formatter = lambda dt: dt.strftime(date_format) else: - formatter = lambda dt: "%s" % dt + formatter = lambda dt: str(dt) if self._hasnans: mask = self._isnan @@ -898,9 +894,9 @@ def period_array( data = np.asarray(data) + dtype: Optional[PeriodDtype] if freq: - # typed Optional here because the else block below assigns None - dtype = PeriodDtype(freq) # type: Optional[PeriodDtype] + dtype = PeriodDtype(freq) else: dtype = None diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 14024401ea110..943dea4252499 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -34,12 +34,7 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCSeries, - ABCSparseArray, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos @@ -49,6 +44,7 @@ from pandas.core.construction import sanitize_array from pandas.core.missing import interpolate_2d import pandas.core.ops as ops +from pandas.core.ops.common import unpack_zerodim_and_defer import pandas.io.formats.printing as printing @@ -1410,12 +1406,8 
@@ def sparse_unary_method(self): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op_name) def sparse_arithmetic_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) @@ -1463,12 +1455,9 @@ def _create_comparison_method(cls, op): if op_name in {"and_", "or_"}: op_name = op_name[:-1] + @unpack_zerodim_and_defer(op_name) def cmp_method(self, other): - if isinstance(other, (ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented - if not is_scalar(other) and not isinstance(other, type(self)): # convert list-like to ndarray other = np.asarray(other) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 11c27451a5801..6ae2903d9826c 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -51,7 +51,7 @@ def _get_label_to_i_dict(labels, sort_labels=False): """ labels = Index(map(tuple, labels)).unique().tolist() # squish if sort_labels: - labels = sorted(list(labels)) + labels = sorted(labels) d = OrderedDict((k, i) for i, k in enumerate(labels)) return d diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c487b227de20..8599b5e39f34a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -85,6 +85,24 @@ def construct_array_type(cls) -> "Type[StringArray]": def __repr__(self) -> str: return "StringDtype" + def __from_arrow__(self, array): + """Construct StringArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # using _from_sequence to ensure None is convered to np.nan + str_arr = StringArray._from_sequence(np.array(arr)) + results.append(str_arr) + + return StringArray._concat_same_type(results) + class StringArray(PandasArray): """ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 21e07b5101a64..bacd0b9699e93 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -45,6 +45,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset @@ -82,10 +83,8 @@ def _td_array_cmp(cls, op): opname = "__{name}__".format(name=op.__name__) nat_result = opname == "__ne__" + @unpack_zerodim_and_defer(opname) def wrapper(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented if _is_convertible_to_td(other) or other is NaT: try: @@ -162,8 +161,8 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _scalar_type = Timedelta __array_priority__ = 1000 # define my properties & methods for delegation - _other_ops = [] # type: List[str] - _bool_ops = [] # type: List[str] + _other_ops: List[str] = [] + _bool_ops: List[str] = [] _object_ops = ["freq"] _field_ops = ["days", "seconds", "microseconds", "nanoseconds"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops diff --git a/pandas/core/base.py 
b/pandas/core/base.py index 10e7b5d186bba..176a92132e20a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -36,7 +36,7 @@ from pandas.core.arrays import ExtensionArray import pandas.core.nanops as nanops -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() _indexops_doc_kwargs = dict( klass="IndexOpsMixin", inplace="", @@ -283,9 +283,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): # people may try to aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert ( - len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 - ) + assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 return f f = getattr(np, arg, None) @@ -324,34 +322,17 @@ def _aggregate(self, arg, *args, **kwargs): _axis = kwargs.pop("_axis", None) if _axis is None: _axis = getattr(self, "axis", 0) - _level = kwargs.pop("_level", None) if isinstance(arg, str): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): - # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") obj = self._selected_obj - def nested_renaming_depr(level: int = 4): - # deprecation of nested renaming - # GH 15931 - msg = textwrap.dedent( - """\ - using a dict with renaming is deprecated and will be removed - in a future version. - - For column-specific groupby renaming, use named aggregation - - >>> df.groupby(...).agg(name=('column', aggfunc)) - """ - ) - warnings.warn(msg, FutureWarning, stacklevel=level) - # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes @@ -374,18 +355,9 @@ def nested_renaming_depr(level: int = 4): # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): - is_nested_renamer = True - - if k not in obj.columns: - msg = ( - "cannot perform renaming for {key} with a " - "nested dictionary" - ).format(key=k) - raise SpecificationError(msg) - nested_renaming_depr(4 + (_level or 0)) - + raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCSeries): - nested_renaming_depr() + raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCDataFrame) and k not in obj.columns: raise KeyError("Column '{col}' does not exist!".format(col=k)) @@ -398,7 +370,7 @@ def nested_renaming_depr(level: int = 4): if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys) ) != len(keys): - nested_renaming_depr() + raise SpecificationError("nested renamer is not supported") from pandas.core.reshape.concat import concat @@ -411,14 +383,14 @@ def _agg_1dim(name, how, subset=None): raise SpecificationError( "nested dictionary is ambiguous in aggregation" ) - return colg.aggregate(how, _level=(_level or 0) + 1) + return colg.aggregate(how) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) - return colg.aggregate(how, _level=None) + return colg.aggregate(how) def _agg(arg, func): """ @@ -535,7 +507,7 @@ def is_any_frame() -> bool: return result, True elif is_list_like(arg): # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None + return self._aggregate_multiple_funcs(arg, _axis=_axis), None else: result = None @@ -546,7 +518,7 @@ def is_any_frame() -> bool: # caller can react return result, True - def _aggregate_multiple_funcs(self, arg, _level, _axis): + 
def _aggregate_multiple_funcs(self, arg, _axis): from pandas.core.reshape.concat import concat if _axis != 0: @@ -631,7 +603,7 @@ def _is_builtin_func(self, arg): class ShallowMixin: - _attributes = [] # type: List[str] + _attributes: List[str] = [] def _shallow_copy(self, obj=None, **kwargs): """ @@ -655,7 +627,7 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 - _deprecations = frozenset( + _deprecations: FrozenSet[str] = frozenset( [ "tolist", # tolist is not deprecated, just suppressed in the __dir__ "base", @@ -665,7 +637,7 @@ class IndexOpsMixin: "flags", "strides", ] - ) # type: FrozenSet[str] + ) def transpose(self, *args, **kwargs): """ @@ -686,7 +658,7 @@ def transpose(self, *args, **kwargs): ) @property - def _is_homogeneous_type(self): + def _is_homogeneous_type(self) -> bool: """ Whether the object has a single dtype. @@ -711,7 +683,7 @@ def shape(self): return self._values.shape @property - def ndim(self): + def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. """ @@ -1467,7 +1439,7 @@ def is_monotonic(self): is_monotonic_increasing = is_monotonic @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return boolean if values in the object are monotonic_decreasing. diff --git a/pandas/core/common.py b/pandas/core/common.py index 565f5076fdddb..41b6ebbd2f196 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -5,7 +5,7 @@ """ import collections -from collections import OrderedDict, abc +from collections import abc from datetime import datetime, timedelta from functools import partial import inspect @@ -14,7 +14,6 @@ import numpy as np from pandas._libs import lib, tslibs -from pandas.compat import PY36 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -215,16 +214,6 @@ def try_sort(iterable): return listed -def dict_keys_to_ordered_list(mapping): - # when pandas drops support for Python < 3.6, this function - # can be replaced by a simple list(mapping.keys()) - if PY36 or isinstance(mapping, OrderedDict): - keys = list(mapping.keys()) - else: - keys = try_sort(mapping) - return keys - - def asarray_tuplesafe(values, dtype=None): if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): @@ -462,7 +451,7 @@ def pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - msg = "%s is both the pipe target and a keyword argument" % target + msg = f"{target} is both the pipe target and a keyword argument" raise ValueError(msg) kwargs[target] = obj return func(*args, **kwargs) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 3e1e5ed89d877..197ddd999fd37 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -8,10 +8,11 @@ from pandas.errors import PerformanceWarning -import pandas as pd +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.computation.common import _result_type_many +from pandas.core.computation.common import result_type_many def _align_core_single_unary_op(term): @@ -33,7 +34,7 @@ def _zip_axes_from_type(typ, new_axes): return axes -def _any_pandas_objects(terms): +def _any_pandas_objects(terms) -> bool: """Check a sequence of terms for instances of PandasObject.""" return any(isinstance(term.value, PandasObject) for term in terms) @@ -49,7 +50,7 @@ 
def wrapper(terms): # we don't have any pandas objects if not _any_pandas_objects(terms): - return _result_type_many(*term_values), None + return result_type_many(*term_values), None return f(terms) @@ -60,7 +61,10 @@ def wrapper(terms): def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] term_dims = [terms[i].value.ndim for i in term_index] - ndims = pd.Series(dict(zip(term_index, term_dims))) + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value @@ -70,7 +74,7 @@ def _align_core(terms): gt_than_one_axis = naxes > 1 for value in (terms[i].value for i in term_index): - is_series = isinstance(value, pd.Series) + is_series = isinstance(value, ABCSeries) is_series_and_gt_one_axis = is_series and gt_than_one_axis for axis, items in enumerate(value.axes): @@ -87,7 +91,7 @@ def _align_core(terms): ti = terms[i].value if hasattr(ti, "reindex"): - transpose = isinstance(ti, pd.Series) and naxes > 1 + transpose = isinstance(ti, ABCSeries) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items term_axis_size = len(ti.axes[axis]) @@ -111,28 +115,28 @@ def _align_core(terms): return typ, _zip_axes_from_type(typ, axes) -def _align(terms): +def align_terms(terms): """Align a set of terms""" try: # flatten the parse tree (a nested list, really) terms = list(com.flatten(terms)) except TypeError: # can't iterate so it must just be a constant or single variable - if isinstance(terms.value, pd.core.generic.NDFrame): + if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) return typ, _zip_axes_from_type(typ, terms.value.axes) return np.result_type(terms.type), None # if all resolved variables are numeric scalars if all(term.is_scalar for term in terms): - return _result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None # perform the main alignment typ, axes = _align_core(terms) return typ, axes -def _reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index bd32c8bee1cdf..da47449d5e62e 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -15,7 +15,7 @@ def _ensure_decoded(s): return s -def _result_type_many(*arrays_and_dtypes): +def result_type_many(*arrays_and_dtypes): """ wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) argument limit """ try: diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index dc6378e83d229..2f3c519d352c6 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -4,7 +4,7 @@ import abc -from pandas.core.computation.align import _align, _reconstruct_object +from pandas.core.computation.align import align_terms, reconstruct_object from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions import pandas.io.formats.printing as printing @@ -46,8 +46,9 @@ def __init__(self, expr): self.aligned_axes = None self.result_type = None - def convert(self): - """Convert an expression for evaluation. + def convert(self) -> str: + """ + Convert an expression for evaluation. Defaults to return the expression as a string. 
""" @@ -66,16 +67,16 @@ def evaluate(self): The result of the passed expression. """ if not self._is_aligned: - self.result_type, self.aligned_axes = _align(self.expr.terms) + self.result_type, self.aligned_axes = align_terms(self.expr.terms) # make sure no names in resolvers and locals/globals clash res = self._evaluate() - return _reconstruct_object( + return reconstruct_object( self.result_type, res, self.aligned_axes, self.expr.terms.return_type ) @property - def _is_aligned(self): + def _is_aligned(self) -> bool: return self.aligned_axes is not None and self.result_type is not None @abc.abstractmethod @@ -104,7 +105,7 @@ class NumExprEngine(AbstractEngine): def __init__(self, expr): super().__init__(expr) - def convert(self): + def convert(self) -> str: return str(super().convert()) def _evaluate(self): diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 461561a80a7e5..72f2e1d8e23e5 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -10,7 +10,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import _engines -from pandas.core.computation.scope import _ensure_scope +from pandas.core.computation.expr import Expr, _parsers, tokenize_string +from pandas.core.computation.scope import ensure_scope from pandas.io.formats.printing import pprint_thing @@ -64,7 +65,7 @@ def _check_engine(engine): return engine -def _check_parser(parser): +def _check_parser(parser: str): """ Make sure a valid parser is passed. @@ -77,7 +78,6 @@ def _check_parser(parser): KeyError * If an invalid parser is passed """ - from pandas.core.computation.expr import _parsers if parser not in _parsers: raise KeyError( @@ -115,7 +115,7 @@ def _check_expression(expr): raise ValueError("expr cannot be an empty string") -def _convert_expression(expr): +def _convert_expression(expr) -> str: """ Convert an object to an expression. @@ -131,7 +131,7 @@ def _convert_expression(expr): Returns ------- - s : unicode + str The string representation of an object. Raises @@ -144,8 +144,7 @@ def _convert_expression(expr): return s -def _check_for_locals(expr, stack_level, parser): - from pandas.core.computation.expr import tokenize_string +def _check_for_locals(expr: str, stack_level: int, parser: str): at_top_of_stack = stack_level == 0 not_pandas_parser = parser != "pandas" @@ -192,7 +191,7 @@ def eval( Parameters ---------- - expr : str or unicode + expr : str The expression to evaluate. This string cannot contain any Python `statements `__, @@ -282,7 +281,6 @@ def eval( See the :ref:`enhancing performance ` documentation for more details. """ - from pandas.core.computation.expr import Expr inplace = validate_bool_kwarg(inplace, "inplace") @@ -311,7 +309,7 @@ def eval( _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope( + env = ensure_scope( level + 1, global_dict=global_dict, local_dict=local_dict, diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 39653c3d695b2..253d64d50d0cd 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -11,7 +11,6 @@ import numpy as np -import pandas as pd import pandas.core.common as com from pandas.core.computation.common import ( _BACKTICK_QUOTED_STRING, @@ -40,7 +39,7 @@ import pandas.io.formats.printing as printing -def tokenize_string(source): +def tokenize_string(source: str): """ Tokenize a Python source code string. 
@@ -171,7 +170,7 @@ def _compose(*funcs): def _preparse( - source, + source: str, f=_compose( _replace_locals, _replace_booleans, @@ -379,7 +378,7 @@ class BaseExprVisitor(ast.NodeVisitor): preparser : callable """ - const_type = Constant # type: Type[Term] + const_type: Type[Term] = Constant term_type = Term binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms @@ -600,6 +599,8 @@ def visit_Index(self, node, **kwargs): return self.visit(node.value) def visit_Subscript(self, node, **kwargs): + import pandas as pd + value = self.visit(node.value) slobj = self.visit(node.slice) result = pd.eval( @@ -837,7 +838,7 @@ def __call__(self): def __repr__(self) -> str: return printing.pprint_thing(self.terms) - def __len__(self): + def __len__(self) -> int: return len(self.expr) def parse(self): diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 46bc762e1a0b3..77999d2c166fd 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -62,9 +62,8 @@ def set_numexpr_threads(n=None): ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b, reversed=False): +def _evaluate_standard(op, op_str, a, b): """ standard evaluation """ - # `reversed` kwarg is included for compatibility with _evaluate_numexpr if _TEST_MODE: _store_test_result(False) with np.errstate(all="ignore"): @@ -97,11 +96,12 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, reversed=False): +def _evaluate_numexpr(op, op_str, a, b): result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): - if reversed: + is_reversed = op.__name__.strip("_").startswith("r") + if is_reversed: # we were originally called by a reversed op method a, b = b, a @@ -190,7 +190,7 @@ def _bool_arith_check( return True -def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): +def evaluate(op, op_str, a, b, use_numexpr=True): """ Evaluate and return the expression of the op on a and b. @@ -203,12 +203,11 @@ def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): b : right operand use_numexpr : bool, default True Whether to try to use numexpr. - reversed : bool, default False """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b, reversed=reversed) + return _evaluate(op, op_str, a, b) return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index fe74b6994be7c..41d7f96f5e96d 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -13,7 +13,7 @@ from pandas.core.dtypes.common import is_list_like, is_scalar import pandas.core.common as com -from pandas.core.computation.common import _ensure_decoded, _result_type_many +from pandas.core.computation.common import _ensure_decoded, result_type_many from pandas.core.computation.scope import _DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded @@ -55,7 +55,7 @@ class UndefinedVariableError(NameError): NameError subclass for local variables. 
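The expressions.py hunk above drops the explicit `reversed` keyword and instead infers it from the op's ``__name__``. A small sketch of that check; ``radd`` below is a hypothetical stand-in for a reversed arithmetic op, not a pandas function:

import operator

def is_reversed_op(op) -> bool:
    # Reversed dunder ops surface with names like "radd" / "__radd__";
    # stripping underscores and testing for a leading "r" identifies them.
    return op.__name__.strip("_").startswith("r")

def radd(left, right):
    return right + left

is_reversed_op(operator.add)  # False
is_reversed_op(radd)          # True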
""" - def __init__(self, name, is_local): + def __init__(self, name, is_local: bool): if is_local: msg = "local variable {0!r} is not defined" else: @@ -69,7 +69,10 @@ def __new__(cls, name, env, side=None, encoding=None): supr_new = super(Term, klass).__new__ return supr_new(klass) + is_local: bool + def __init__(self, name, env, side=None, encoding=None): + # name is a str for Term, but may be something else for subclasses self._name = name self.env = env self.side = side @@ -79,7 +82,7 @@ def __init__(self, name, env, side=None, encoding=None): self.encoding = encoding @property - def local_name(self): + def local_name(self) -> str: return self.name.replace(_LOCAL_TAG, "") def __repr__(self) -> str: @@ -120,7 +123,7 @@ def update(self, value): self.value = value @property - def is_scalar(self): + def is_scalar(self) -> bool: return is_scalar(self._value) @property @@ -139,14 +142,14 @@ def type(self): return_type = type @property - def raw(self): + def raw(self) -> str: return pprint_thing( "{0}(name={1!r}, type={2})" "".format(self.__class__.__name__, self.name, self.type) ) @property - def is_datetime(self): + def is_datetime(self) -> bool: try: t = self.type.type except AttributeError: @@ -167,7 +170,7 @@ def name(self): return self._name @property - def ndim(self): + def ndim(self) -> int: return self._value.ndim @@ -196,7 +199,9 @@ class Op: Hold an operator of arbitrary arity. """ - def __init__(self, op, operands, *args, **kwargs): + op: str + + def __init__(self, op: str, operands, *args, **kwargs): self.op = _bool_op_map.get(op, op) self.operands = operands self.encoding = kwargs.get("encoding", None) @@ -217,10 +222,10 @@ def return_type(self): # clobber types to bool if the op is a boolean operator if self.op in (_cmp_ops_syms + _bool_ops_syms): return np.bool_ - return _result_type_many(*(term.type for term in com.flatten(self))) + return result_type_many(*(term.type for term in com.flatten(self))) @property - def has_invalid_return_type(self): + def has_invalid_return_type(self) -> bool: types = self.operand_types obj_dtype_set = frozenset([np.dtype("object")]) return self.return_type == object and types - obj_dtype_set @@ -230,11 +235,11 @@ def operand_types(self): return frozenset(term.type for term in com.flatten(self)) @property - def is_scalar(self): + def is_scalar(self) -> bool: return all(operand.is_scalar for operand in self.operands) @property - def is_datetime(self): + def is_datetime(self) -> bool: try: t = self.return_type.type except AttributeError: @@ -339,7 +344,7 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): term.update(new_value) -def is_term(obj): +def is_term(obj) -> bool: return isinstance(obj, Term) @@ -354,7 +359,7 @@ class BinOp(Op): right : Term or Op """ - def __init__(self, op, lhs, rhs, **kwargs): + def __init__(self, op: str, lhs, rhs, **kwargs): super().__init__(op, (lhs, rhs)) self.lhs = lhs self.rhs = rhs @@ -396,7 +401,7 @@ def __call__(self, env): return self.func(left, right) - def evaluate(self, env, engine, parser, term_type, eval_in_python): + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): """ Evaluate a binary operation *before* being passed to the engine. @@ -488,7 +493,7 @@ def _disallow_scalar_only_bool_ops(self): raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype): +def isnumeric(dtype) -> bool: return issubclass(np.dtype(dtype).type, np.number) @@ -505,8 +510,8 @@ class Div(BinOp): regardless of the value of ``truediv``. 
""" - def __init__(self, lhs, rhs, truediv, *args, **kwargs): - super().__init__("/", lhs, rhs, *args, **kwargs) + def __init__(self, lhs, rhs, truediv: bool, **kwargs): + super().__init__("/", lhs, rhs, **kwargs) if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): raise TypeError( @@ -541,7 +546,7 @@ class UnaryOp(Op): * If no function associated with the passed operator token is found. """ - def __init__(self, op, operand): + def __init__(self, op: str, operand): super().__init__(op, (operand,)) self.operand = operand @@ -561,7 +566,7 @@ def __repr__(self) -> str: return pprint_thing("{0}({1})".format(self.op, self.operand)) @property - def return_type(self): + def return_type(self) -> np.dtype: operand = self.operand if operand.return_type == np.dtype("bool"): return np.dtype("bool") @@ -588,7 +593,7 @@ def __repr__(self) -> str: class FuncNode: - def __init__(self, name): + def __init__(self, name: str): from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION if name not in _mathops or ( diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 3a2ea30cbc8b9..58bbfd0a1bdee 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,6 +2,7 @@ import ast from functools import partial +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -12,7 +13,7 @@ import pandas as pd import pandas.core.common as com -from pandas.core.computation import expr, ops +from pandas.core.computation import expr, ops, scope as _scope from pandas.core.computation.common import _ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term @@ -20,25 +21,36 @@ from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -class Scope(expr.Scope): +class PyTablesScope(_scope.Scope): __slots__ = ("queryables",) - def __init__(self, level, global_dict=None, local_dict=None, queryables=None): + queryables: Dict[str, Any] + + def __init__( + self, + level: int, + global_dict=None, + local_dict=None, + queryables: Optional[Dict[str, Any]] = None, + ): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) self.queryables = queryables or dict() class Term(ops.Term): + env: PyTablesScope + def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls return object.__new__(klass) - def __init__(self, name, env, side=None, encoding=None): + def __init__(self, name, env: PyTablesScope, side=None, encoding=None): super().__init__(name, env, side=side, encoding=encoding) def _resolve_name(self): # must be a queryables if self.side == "left": + # Note: The behavior of __new__ ensures that self.name is a str here if self.name not in self.env.queryables: raise NameError("name {name!r} is not defined".format(name=self.name)) return self.name @@ -56,7 +68,8 @@ def value(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): + def __init__(self, value, env: PyTablesScope, side=None, encoding=None): + assert isinstance(env, PyTablesScope), type(env) super().__init__(value, env, side=side, encoding=encoding) def _resolve_name(self): @@ -67,11 +80,13 @@ class BinOp(ops.BinOp): _max_selectors = 31 - def __init__(self, op, lhs, rhs, queryables, encoding): + op: str + queryables: Dict[str, Any] + + def __init__(self, op: str, lhs, rhs, queryables: Dict[str, Any], encoding): super().__init__(op, lhs, rhs) 
self.queryables = queryables self.encoding = encoding - self.filter = None self.condition = None def _disallow_scalar_only_bool_ops(self): @@ -129,12 +144,12 @@ def conform(self, rhs): return rhs @property - def is_valid(self): + def is_valid(self) -> bool: """ return True if this is a valid field """ return self.lhs in self.queryables @property - def is_in_table(self): + def is_in_table(self) -> bool: """ return True if this is a valid column name for generation (e.g. an actual column in the table) """ return self.queryables.get(self.lhs) is not None @@ -154,12 +169,12 @@ def metadata(self): """ the metadata of my field """ return getattr(self.queryables.get(self.lhs), "metadata", None) - def generate(self, v): + def generate(self, v) -> str: """ create and return the op string for this TermValue """ val = v.tostring(self.encoding) return "({lhs} {op} {val})".format(lhs=self.lhs, op=self.op, val=val) - def convert_value(self, v): + def convert_value(self, v) -> "TermValue": """ convert the expression that is in the term to something that is accepted by pytables """ @@ -229,7 +244,11 @@ def convert_values(self): class FilterBinOp(BinOp): + filter: Optional[Tuple[Any, Any, pd.Index]] = None + def __repr__(self) -> str: + if self.filter is None: + return "Filter: Not Initialized" return pprint_thing( "[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1]) ) @@ -252,7 +271,7 @@ def evaluate(self): raise ValueError("query term is not valid [{slf}]".format(slf=self)) rhs = self.conform(self.rhs) - values = [TermValue(v, v, self.kind).value for v in rhs] + values = list(rhs) if self.is_in_table: @@ -279,7 +298,7 @@ def evaluate(self): return self - def generate_filter_op(self, invert=False): + def generate_filter_op(self, invert: bool = False): if (self.op == "!=" and not invert) or (self.op == "==" and invert): return lambda axis, vals: ~axis.isin(vals) else: @@ -368,10 +387,7 @@ def prune(self, klass): return None -_op_classes = {"unary": UnaryOp} - - -class ExprVisitor(BaseExprVisitor): +class PyTablesExprVisitor(BaseExprVisitor): const_type = Constant term_type = Term @@ -471,25 +487,29 @@ def _validate_where(w): TypeError : An invalid data type was passed in for w (e.g. dict). """ - if not (isinstance(w, (Expr, str)) or is_list_like(w)): - raise TypeError("where must be passed as a string, Expr, or list-like of Exprs") + if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)): + raise TypeError( + "where must be passed as a string, PyTablesExpr, " + "or list-like of PyTablesExpr" + ) return w -class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' +class PyTablesExpr(expr.Expr): + """ + Hold a pytables-like expression, comprised of possibly multiple 'terms'. 
Parameters ---------- - where : string term expression, Expr, or list-like of Exprs + where : string term expression, PyTablesExpr, or list-like of PyTablesExprs queryables : a "kinds" map (dict of column name -> kind), or None if column is non-indexable encoding : an encoding that will encode the query terms Returns ------- - an Expr object + a PyTablesExpr object Examples -------- @@ -505,7 +525,16 @@ class Expr(expr.Expr): "major_axis>=20130101" """ - def __init__(self, where, queryables=None, encoding=None, scope_level=0): + _visitor: Optional[PyTablesExprVisitor] + env: PyTablesScope + + def __init__( + self, + where, + queryables: Optional[Dict[str, Any]] = None, + encoding=None, + scope_level: int = 0, + ): where = _validate_where(where) @@ -518,25 +547,28 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0): # capture the environment if needed local_dict = DeepChainMap() - if isinstance(where, Expr): + if isinstance(where, PyTablesExpr): local_dict = where.env.scope - where = where.expr + _where = where.expr elif isinstance(where, (list, tuple)): + where = list(where) for idx, w in enumerate(where): - if isinstance(w, Expr): + if isinstance(w, PyTablesExpr): local_dict = w.env.scope else: w = _validate_where(w) where[idx] = w - where = " & ".join(map("({})".format, com.flatten(where))) # noqa + _where = " & ".join(map("({})".format, com.flatten(where))) + else: + _where = where - self.expr = where - self.env = Scope(scope_level + 1, local_dict=local_dict) + self.expr = _where + self.env = PyTablesScope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, str): self.env.queryables.update(queryables) - self._visitor = ExprVisitor( + self._visitor = PyTablesExprVisitor( self.env, queryables=queryables, parser="pytables", @@ -574,30 +606,31 @@ def evaluate(self): class TermValue: """ hold a term value the we use to construct a condition/filter """ - def __init__(self, value, converted, kind): + def __init__(self, value, converted, kind: str): + assert isinstance(kind, str), kind self.value = value self.converted = converted self.kind = kind - def tostring(self, encoding): + def tostring(self, encoding) -> str: """ quote the string if not encoded else encode and return """ if self.kind == "string": if encoding is not None: - return self.converted + return str(self.converted) return '"{converted}"'.format(converted=self.converted) elif self.kind == "float": # python 2 str(float) is not always # round-trippable so use repr() return repr(self.converted) - return self.converted + return str(self.converted) -def maybe_expression(s): +def maybe_expression(s) -> bool: """ loose checking if s is a pytables-acceptable expression """ if not isinstance(s, str): return False - ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",) + ops = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",) # make sure we have an op at least return any(op in s for op in ops) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 81c7b04bf3284..78a47afcc0830 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -9,6 +9,7 @@ import pprint import struct import sys +from typing import List import numpy as np @@ -16,9 +17,9 @@ from pandas.compat.chainmap import DeepChainMap -def _ensure_scope( - level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs -): +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None, 
**kwargs +) -> "Scope": """Ensure that we are grabbing the correct scope.""" return Scope( level + 1, @@ -29,7 +30,7 @@ def _ensure_scope( ) -def _replacer(x): +def _replacer(x) -> str: """Replace a number with its hexadecimal representation. Used to tag temporary variables with their calling scope's id. """ @@ -44,11 +45,11 @@ def _replacer(x): return hex(hexin) -def _raw_hex_id(obj): +def _raw_hex_id(obj) -> str: """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns packed = struct.pack("@P", id(obj)) - return "".join(map(_replacer, packed)) + return "".join(_replacer(x) for x in packed) _DEFAULT_GLOBALS = { @@ -63,7 +64,7 @@ def _raw_hex_id(obj): } -def _get_pretty_string(obj): +def _get_pretty_string(obj) -> str: """ Return a prettier version of obj. @@ -74,7 +75,7 @@ def _get_pretty_string(obj): Returns ------- - s : str + str Pretty print object repr """ sio = StringIO() @@ -119,7 +120,7 @@ def __init__( self.scope.update(local_dict.scope) if local_dict.target is not None: self.target = local_dict.target - self.update(local_dict.level) + self._update(local_dict.level) frame = sys._getframe(self.level) @@ -148,8 +149,9 @@ def __repr__(self) -> str: ) @property - def has_resolvers(self): - """Return whether we have any extra scope. + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. For example, DataFrames pass Their columns as resolvers during calls to ``DataFrame.eval()`` and ``DataFrame.query()``. @@ -160,7 +162,7 @@ def has_resolvers(self): """ return bool(len(self.resolvers)) - def resolve(self, key, is_local): + def resolve(self, key: str, is_local: bool): """ Resolve a variable name in a possibly local context. @@ -202,7 +204,7 @@ def resolve(self, key, is_local): raise UndefinedVariableError(key, is_local) - def swapkey(self, old_key, new_key, new_value=None): + def swapkey(self, old_key: str, new_key: str, new_value=None): """ Replace a variable name, with a potentially new value. @@ -227,7 +229,7 @@ def swapkey(self, old_key, new_key, new_value=None): mapping[new_key] = new_value return - def _get_vars(self, stack, scopes): + def _get_vars(self, stack, scopes: List[str]): """ Get specifically scoped variables from a list of stack frames. @@ -250,13 +252,13 @@ def _get_vars(self, stack, scopes): # scope after the loop del frame - def update(self, level): + def _update(self, level: int): """ Update the current scope by going back `level` levels. Parameters ---------- - level : int or None, optional, default None + level : int """ sl = level + 1 @@ -270,7 +272,7 @@ def update(self, level): finally: del stack[:], stack - def add_tmp(self, value): + def add_tmp(self, value) -> str: """ Add a temporary variable to the scope. @@ -281,7 +283,7 @@ def add_tmp(self, value): Returns ------- - name : basestring + str The name of the temporary variable created. 
""" name = "{name}_{num}_{hex_id}".format( @@ -297,7 +299,7 @@ def add_tmp(self, value): return name @property - def ntemps(self): + def ntemps(self) -> int: """The number of temporary variables in this scope""" return len(self.temps) diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index 2b527e1fb5890..cb0912cbcf880 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -12,7 +12,6 @@ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -32,7 +31,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_period, is_period_dtype, is_re, is_re_compilable, diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2cc7c44cc05af..8acdf32c8768e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -81,13 +81,14 @@ def __from_arrow__( provided for registering virtual subclasses. """ - _metadata = () # type: Tuple[str, ...] + _metadata: Tuple[str, ...] = () def __str__(self) -> str: return self.name - def __eq__(self, other): - """Check whether 'other' is equal to self. + def __eq__(self, other) -> bool: + """ + Check whether 'other' is equal to self. By default, 'other' is considered equal if either @@ -115,10 +116,10 @@ def __eq__(self, other): ) return False - def __hash__(self): + def __hash__(self) -> int: return hash(tuple(getattr(self, attr) for attr in self._metadata)) - def __ne__(self, other): + def __ne__(self, other) -> bool: return not self.__eq__(other) @property @@ -171,7 +172,8 @@ def name(self) -> str: @property def names(self) -> Optional[List[str]]: - """Ordered list of field names, or None if there are no fields. + """ + Ordered list of field names, or None if there are no fields. This is for compatibility with NumPy arrays, and may be removed in the future. @@ -233,16 +235,19 @@ def construct_from_string(cls, string: str): ... "'{}'".format(cls.__name__, string)) """ if not isinstance(string, str): - raise TypeError("Expects a string, got {}".format(type(string))) + raise TypeError("Expects a string, got {typ}".format(typ=type(string))) if string != cls.name: raise TypeError( - "Cannot construct a '{}' from '{}'".format(cls.__name__, string) + "Cannot construct a '{cls}' from '{string}'".format( + cls=cls.__name__, string=string + ) ) return cls() @classmethod def is_dtype(cls, dtype) -> bool: - """Check if we match 'dtype'. + """ + Check if we match 'dtype'. Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 637c42eef8a5a..acf8b6ca4e312 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -72,7 +72,7 @@ def maybe_convert_platform(values): return values -def is_nested_object(obj): +def is_nested_object(obj) -> bool: """ return a boolean if we have a nested object, e.g. a Series with 1 or more Series elements @@ -500,11 +500,11 @@ def _ensure_dtype_type(value, dtype): def infer_dtype_from(val, pandas_dtype: bool = False): """ - interpret the dtype from a scalar or array. This is a convenience - routines to infer dtype from a scalar or an array + Interpret the dtype from a scalar or array. Parameters ---------- + val : object pandas_dtype : bool, default False whether to infer dtype including pandas extension types. 
If False, scalar/array belongs to pandas extension types is inferred as @@ -517,7 +517,7 @@ def infer_dtype_from(val, pandas_dtype: bool = False): def infer_dtype_from_scalar(val, pandas_dtype: bool = False): """ - interpret the dtype from a scalar + Interpret the dtype from a scalar. Parameters ---------- @@ -592,7 +592,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): def infer_dtype_from_array(arr, pandas_dtype: bool = False): """ - infer the dtype from a scalar or array + Infer the dtype from a scalar or array. Parameters ---------- @@ -647,7 +647,8 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False): def maybe_infer_dtype_type(element): - """Try to infer an object's dtype, for use in arithmetic ops + """ + Try to infer an object's dtype, for use in arithmetic ops. Uses `element.dtype` if that's available. Objects implementing the iterator protocol are cast to a NumPy array, @@ -679,8 +680,9 @@ def maybe_infer_dtype_type(element): return tipo -def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): - """ provide explicit type promotion and coercion +def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): + """ + Provide explicit type promotion and coercion. Parameters ---------- @@ -759,7 +761,7 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True, skipna=False): +def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -982,7 +984,7 @@ def soft_convert_objects( return values -def maybe_castable(arr): +def maybe_castable(arr) -> bool: # return False to force a non-fastpath # check datetime64[ns]/timedelta64[ns] are valid @@ -996,7 +998,7 @@ def maybe_castable(arr): return arr.dtype.name not in _POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike(value, convert_dates=False): +def maybe_infer_to_datetimelike(value, convert_dates: bool = False): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1103,7 +1105,7 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors="raise"): +def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -1292,7 +1294,7 @@ def find_common_type(types): def cast_scalar_to_array(shape, value, dtype=None): """ - create np.ndarray of specified shape and dtype, filled with values + Create np.ndarray of specified shape and dtype, filled with values. Parameters ---------- @@ -1318,7 +1320,7 @@ def cast_scalar_to_array(shape, value, dtype=None): return values -def construct_1d_arraylike_from_scalar(value, length, dtype): +def construct_1d_arraylike_from_scalar(value, length: int, dtype): """ create a np.ndarray / pandas type of specified shape and dtype filled with values @@ -1383,7 +1385,7 @@ def construct_1d_object_array_from_listlike(values): return result -def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): +def construct_1d_ndarray_preserving_na(values, dtype=None, copy: bool = False): """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. 
@@ -1424,7 +1426,7 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): return subarr -def maybe_cast_to_integer_array(arr, dtype, copy=False): +def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5180f513dfed0..783669688ea42 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -6,7 +6,6 @@ from pandas._libs import algos, lib from pandas._libs.tslibs import conversion -from pandas.compat import PY36 from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -222,7 +221,7 @@ def classes_and_not_datetimelike(*klasses) -> Callable: ) -def is_object_dtype(arr_or_dtype): +def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. @@ -252,7 +251,7 @@ def is_object_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.object_)) -def is_sparse(arr): +def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. @@ -304,7 +303,7 @@ def is_sparse(arr): return isinstance(dtype, SparseDtype) -def is_scipy_sparse(arr): +def is_scipy_sparse(arr) -> bool: """ Check whether an array-like is a scipy.sparse.spmatrix instance. @@ -339,6 +338,7 @@ def is_scipy_sparse(arr): except ImportError: _is_scipy_sparse = lambda _: False + assert _is_scipy_sparse is not None return _is_scipy_sparse(arr) @@ -375,57 +375,7 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_datetimetz(arr): - """ - Check whether an array-like is a datetime array-like with a timezone - component in its dtype. - - .. deprecated:: 0.24.0 - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a datetime array-like with a - timezone component in its dtype. - - Examples - -------- - >>> is_datetimetz([1, 2, 3]) - False - - Although the following examples are both DatetimeIndex objects, - the first one returns False because it has no timezone component - unlike the second one, which returns True. - - >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3])) - False - >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - True - - The object need not be a DatetimeIndex object. It just needs to have - a dtype which has a timezone component. - - >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") - >>> s = pd.Series([], dtype=dtype) - >>> is_datetimetz(s) - True - """ - - warnings.warn( - "'is_datetimetz' is deprecated and will be removed in a " - "future version. Use 'is_datetime64tz_dtype' instead.", - FutureWarning, - stacklevel=2, - ) - return is_datetime64tz_dtype(arr) - - -def is_offsetlike(arr_or_obj): +def is_offsetlike(arr_or_obj) -> bool: """ Check if obj or all elements of list-like is DateOffset @@ -456,44 +406,7 @@ def is_offsetlike(arr_or_obj): return False -def is_period(arr): - """ - Check whether an array-like is a periodical index. - - .. deprecated:: 0.24.0 - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a periodical index. 
- - Examples - -------- - >>> is_period([1, 2, 3]) - False - >>> is_period(pd.Index([1, 2, 3])) - False - >>> is_period(pd.PeriodIndex(["2017-01-01"], freq="D")) - True - """ - - warnings.warn( - "'is_period' is deprecated and will be removed in a future " - "version. Use 'is_period_dtype' or is_period_arraylike' " - "instead.", - FutureWarning, - stacklevel=2, - ) - - return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) - - -def is_datetime64_dtype(arr_or_dtype): +def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. @@ -524,7 +437,7 @@ def is_datetime64_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.datetime64)) -def is_datetime64tz_dtype(arr_or_dtype): +def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. @@ -562,7 +475,7 @@ def is_datetime64tz_dtype(arr_or_dtype): return DatetimeTZDtype.is_dtype(arr_or_dtype) -def is_timedelta64_dtype(arr_or_dtype): +def is_timedelta64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the timedelta64 dtype. @@ -593,7 +506,7 @@ def is_timedelta64_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.timedelta64)) -def is_period_dtype(arr_or_dtype): +def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. @@ -627,7 +540,7 @@ def is_period_dtype(arr_or_dtype): return PeriodDtype.is_dtype(arr_or_dtype) -def is_interval_dtype(arr_or_dtype): +def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. @@ -696,7 +609,7 @@ def is_categorical_dtype(arr_or_dtype) -> bool: return CategoricalDtype.is_dtype(arr_or_dtype) -def is_string_dtype(arr_or_dtype): +def is_string_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the string dtype. @@ -726,13 +639,13 @@ def is_string_dtype(arr_or_dtype): """ # TODO: gh-15585: consider making the checks stricter. - def condition(dtype): + def condition(dtype) -> bool: return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) return _is_dtype(arr_or_dtype, condition) -def is_period_arraylike(arr): +def is_period_arraylike(arr) -> bool: """ Check whether an array-like is a periodical array-like or PeriodIndex. @@ -764,7 +677,7 @@ def is_period_arraylike(arr): return getattr(arr, "inferred_type", None) == "period" -def is_datetime_arraylike(arr): +def is_datetime_arraylike(arr) -> bool: """ Check whether an array-like is a datetime array-like or DatetimeIndex. @@ -799,7 +712,7 @@ def is_datetime_arraylike(arr): return getattr(arr, "inferred_type", None) == "datetime" -def is_dtype_equal(source, target): +def is_dtype_equal(source, target) -> bool: """ Check if two dtypes are equal. @@ -889,7 +802,7 @@ def is_any_int_dtype(arr_or_dtype) -> bool: return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64)) -def is_integer_dtype(arr_or_dtype): +def is_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an integer dtype. @@ -944,7 +857,7 @@ def is_integer_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer)) -def is_signed_integer_dtype(arr_or_dtype): +def is_signed_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a signed integer dtype. 
@@ -1001,7 +914,7 @@ def is_signed_integer_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) -def is_unsigned_integer_dtype(arr_or_dtype): +def is_unsigned_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an unsigned integer dtype. @@ -1050,7 +963,7 @@ def is_unsigned_integer_dtype(arr_or_dtype): ) -def is_int64_dtype(arr_or_dtype): +def is_int64_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the int64 dtype. @@ -1141,7 +1054,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) -def is_datetime64_ns_dtype(arr_or_dtype): +def is_datetime64_ns_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the datetime64[ns] dtype. @@ -1191,7 +1104,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE -def is_timedelta64_ns_dtype(arr_or_dtype): +def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the timedelta64[ns] dtype. @@ -1222,7 +1135,7 @@ def is_timedelta64_ns_dtype(arr_or_dtype): return _is_dtype(arr_or_dtype, lambda dtype: dtype == _TD_DTYPE) -def is_datetime_or_timedelta_dtype(arr_or_dtype): +def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a timedelta64 or datetime64 dtype. @@ -1265,9 +1178,6 @@ def _is_unorderable_exception(e: TypeError) -> bool: """ Check if the exception raised is an unorderable exception. - The error message differs for 3 <= PY <= 3.5 and PY >= 3.6, so - we need to condition based on Python version. - Parameters ---------- e : Exception or sub-class @@ -1275,17 +1185,13 @@ def _is_unorderable_exception(e: TypeError) -> bool: Returns ------- - boolean + bool Whether or not the exception raised is an unorderable exception. """ - - if PY36: - return "'>' not supported between instances of" in str(e) - - return "unorderable" in str(e) + return "'>' not supported between instances of" in str(e) -def needs_i8_conversion(arr_or_dtype): +def needs_i8_conversion(arr_or_dtype) -> bool: """ Check whether the array or dtype should be converted to int64. @@ -1329,7 +1235,7 @@ def needs_i8_conversion(arr_or_dtype): ) -def is_numeric_dtype(arr_or_dtype): +def is_numeric_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a numeric dtype. @@ -1372,7 +1278,7 @@ def is_numeric_dtype(arr_or_dtype): ) -def is_string_like_dtype(arr_or_dtype): +def is_string_like_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a string-like dtype. @@ -1404,7 +1310,7 @@ def is_string_like_dtype(arr_or_dtype): return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) -def is_float_dtype(arr_or_dtype): +def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. @@ -1503,11 +1409,12 @@ def is_bool_dtype(arr_or_dtype) -> bool: return issubclass(dtype.type, np.bool_) -def is_extension_type(arr): +def is_extension_type(arr) -> bool: """ Check whether an array-like is of a pandas extension class instance. .. deprecated:: 1.0.0 + Use ``is_extension_array_dtype`` instead. Extension classes include categoricals, pandas sparse objects (i.e. 
classes represented within the pandas library and not ones external @@ -1567,7 +1474,7 @@ def is_extension_type(arr): return False -def is_extension_array_dtype(arr_or_dtype): +def is_extension_array_dtype(arr_or_dtype) -> bool: """ Check if an object is a pandas extension array type. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a62d3d0f4e65b..7b3e7d4f42121 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -69,7 +69,7 @@ def get_dtype_kinds(l): return typs -def concat_compat(to_concat, axis=0): +def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -88,7 +88,7 @@ def concat_compat(to_concat, axis=0): # filter empty arrays # 1-d dtypes always are included here - def is_nonempty(x): + def is_nonempty(x) -> bool: if x.ndim <= axis: return True return x.shape[axis] > 0 @@ -137,7 +137,7 @@ def is_nonempty(x): return np.concatenate(to_concat, axis=axis) -def concat_categorical(to_concat, axis=0): +def concat_categorical(to_concat, axis: int = 0): """Concatenate an object/categorical array of arrays, each of which is a single dtype @@ -183,7 +183,9 @@ def concat_categorical(to_concat, axis=0): return result -def union_categoricals(to_union, sort_categories=False, ignore_order=False): +def union_categoricals( + to_union, sort_categories: bool = False, ignore_order: bool = False +): """ Combine list-like of Categorical-like, unioning categories. @@ -355,7 +357,7 @@ def _maybe_unwrap(x): return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) -def _concatenate_2d(to_concat, axis): +def _concatenate_2d(to_concat, axis: int): # coerce to 2d if needed & concatenate if axis == 1: to_concat = [np.atleast_2d(x) for x in to_concat] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4a4ad076f14ca..523c8e8bd02d0 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -20,7 +20,7 @@ # GH26403: sentinel value used for the default value of ordered in the # CategoricalDtype constructor to detect when ordered=None is explicitly passed -ordered_sentinel = object() # type: object +ordered_sentinel: object = object() def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: @@ -51,7 +51,7 @@ def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: class Registry: """ - Registry for dtype inference + Registry for dtype inference. The registry allows one to map a string repr of a extension dtype to an extension dtype. The string alias can be used in several @@ -66,7 +66,7 @@ class Registry: """ def __init__(self): - self.dtypes = [] # type: List[Type[ExtensionDtype]] + self.dtypes: List[Type[ExtensionDtype]] = [] def register(self, dtype: Type[ExtensionDtype]) -> None: """ @@ -119,21 +119,21 @@ class PandasExtensionDtype(ExtensionDtype): THIS IS NOT A REAL NUMPY DTYPE """ - type = None # type: Any - kind = None # type: Any + type: Any + kind: Any # The Any type annotations above are here only because mypy seems to have a # problem dealing with with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None - str = None # type: Optional[str_type] + str: Optional[str_type] = None num = 100 - shape = tuple() # type: Tuple[int, ...] + shape: Tuple[int, ...] 
= tuple() itemsize = 8 base = None isbuiltin = 0 isnative = 0 - _cache = {} # type: Dict[str_type, 'PandasExtensionDtype'] + _cache: Dict[str_type, "PandasExtensionDtype"] = {} def __str__(self) -> str_type: """ @@ -214,12 +214,12 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): # TODO: Document public vs. private API name = "category" - type = CategoricalDtypeType # type: Type[CategoricalDtypeType] - kind = "O" # type: str_type + type: Type[CategoricalDtypeType] = CategoricalDtypeType + kind: str_type = "O" str = "|O08" base = np.dtype("O") _metadata = ("categories", "ordered", "_ordered_from_sentinel") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __init__( self, categories=None, ordered: Union[Ordered, object] = ordered_sentinel @@ -416,12 +416,12 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self) -> str_type: - tpl = "CategoricalDtype(categories={}ordered={})" + tpl = "CategoricalDtype(categories={data}ordered={ordered})" if self.categories is None: data = "None, " else: data = self.categories._format_data(name=self.__class__.__name__) - return tpl.format(data, self._ordered) + return tpl.format(data=data, ordered=self._ordered) @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: @@ -650,15 +650,15 @@ class DatetimeTZDtype(PandasExtensionDtype): datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] """ - type = Timestamp # type: Type[Timestamp] - kind = "M" # type: str_type + type: Type[Timestamp] = Timestamp + kind: str_type = "M" str = "|M8[ns]" num = 101 base = np.dtype("M8[ns]") na_value = NaT _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __init__(self, unit="ns", tz=None): if isinstance(unit, DatetimeTZDtype): @@ -719,7 +719,7 @@ def construct_array_type(cls): return DatetimeArray @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type): """ Construct a DatetimeTZDtype from a string. @@ -736,7 +736,7 @@ def construct_from_string(cls, string): datetime64[ns, UTC] """ if isinstance(string, str): - msg = "Could not construct DatetimeTZDtype from '{}'" + msg = "Could not construct DatetimeTZDtype from '{string}'" match = cls._match.match(string) if match: d = match.groupdict() @@ -747,8 +747,8 @@ def construct_from_string(cls, string): # pytz timezone (actually pytz.UnknownTimeZoneError). # TypeError if we pass a nonsense tz; # ValueError if we pass a unit other than "ns" - raise TypeError(msg.format(string)) from err - raise TypeError(msg.format(string)) + raise TypeError(msg.format(string=string)) from err + raise TypeError(msg.format(string=string)) raise TypeError("Could not construct DatetimeTZDtype") @@ -756,11 +756,11 @@ def __str__(self) -> str_type: return "datetime64[{unit}, {tz}]".format(unit=self.unit, tz=self.tz) @property - def name(self): + def name(self) -> str_type: """A string representation of the dtype.""" return str(self) - def __hash__(self): + def __hash__(self) -> int: # make myself hashable # TODO: update this. 
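The `DatetimeTZDtype.construct_from_string` and `name` changes above concern how the dtype round-trips through its string form; for reference, the equivalent public constructions (output shown as of the pandas version this patch targets):

>>> import pandas as pd
>>> pd.DatetimeTZDtype(unit="ns", tz="UTC")
datetime64[ns, UTC]
>>> pd.api.types.pandas_dtype("datetime64[ns, UTC]")
datetime64[ns, UTC]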
return hash(str(self)) @@ -812,14 +812,14 @@ class PeriodDtype(PandasExtensionDtype): period[M] """ - type = Period # type: Type[Period] - kind = "O" # type: str_type + type: Type[Period] = Period + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 102 _metadata = ("freq",) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, freq=None): """ @@ -893,14 +893,14 @@ def __str__(self) -> str_type: return self.name @property - def name(self): + def name(self) -> str_type: return "period[{freq}]".format(freq=self.freq.freqstr) @property def na_value(self): return NaT - def __hash__(self): + def __hash__(self) -> int: # make myself hashable return hash(str(self)) @@ -917,7 +917,7 @@ def __setstate__(self, state): self._freq = state["freq"] @classmethod - def is_dtype(cls, dtype): + def is_dtype(cls, dtype) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -972,13 +972,13 @@ class IntervalDtype(PandasExtensionDtype): """ name = "interval" - kind = None # type: Optional[str_type] + kind: Optional[str_type] = None str = "|O08" base = np.dtype("O") num = 103 _metadata = ("subtype",) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, subtype=None): from pandas.core.dtypes.common import ( @@ -1073,7 +1073,7 @@ def __str__(self) -> str_type: return "interval" return "interval[{subtype}]".format(subtype=self.subtype) - def __hash__(self): + def __hash__(self) -> int: # make myself hashable return hash(str(self)) @@ -1097,7 +1097,7 @@ def __setstate__(self, state): self._subtype = state["subtype"] @classmethod - def is_dtype(cls, dtype): + def is_dtype(cls, dtype) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 2518f330b26a3..aa0f7d2aba1fc 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -5,7 +5,7 @@ # objects def create_pandas_abc_type(name, attr, comp): @classmethod - def _check(cls, inst): + def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp dct = dict(__instancecheck__=_check, __subclasscheck__=_check) @@ -74,7 +74,7 @@ def _check(cls, inst): class _ABCGeneric(type): - def __instancecheck__(cls, inst): + def __instancecheck__(cls, inst) -> bool: return hasattr(inst, "_data") diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 61fa7940c1bce..9e9278052e35d 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -26,7 +26,7 @@ is_list_like = lib.is_list_like -def is_number(obj): +def is_number(obj) -> bool: """ Check if the object is a number. @@ -67,7 +67,7 @@ def is_number(obj): return isinstance(obj, (Number, np.number)) -def _iterable_not_string(obj): +def _iterable_not_string(obj) -> bool: """ Check if the object is an iterable but not a string. @@ -93,7 +93,7 @@ def _iterable_not_string(obj): return isinstance(obj, abc.Iterable) and not isinstance(obj, str) -def is_iterator(obj): +def is_iterator(obj) -> bool: """ Check if the object is an iterator. @@ -127,7 +127,7 @@ def is_iterator(obj): return hasattr(obj, "__next__") -def is_file_like(obj): +def is_file_like(obj) -> bool: """ Check if the object is a file-like object. 
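Similarly, the `PeriodDtype` and `IntervalDtype` `_match` patterns above back the string aliases accepted by `pandas_dtype`; a quick illustration (reprs shown as of this pandas version):

>>> from pandas.api.types import pandas_dtype
>>> pandas_dtype("period[M]")
period[M]
>>> pandas_dtype("interval[int64]")
interval[int64]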
@@ -165,7 +165,7 @@ def is_file_like(obj): return True -def is_re(obj): +def is_re(obj) -> bool: """ Check if the object is a regex pattern instance. @@ -188,7 +188,7 @@ def is_re(obj): return isinstance(obj, Pattern) -def is_re_compilable(obj): +def is_re_compilable(obj) -> bool: """ Check if the object can be compiled into a regex pattern instance. @@ -217,7 +217,7 @@ def is_re_compilable(obj): return True -def is_array_like(obj): +def is_array_like(obj) -> bool: """ Check if the object is array-like. @@ -250,7 +250,7 @@ def is_array_like(obj): return is_list_like(obj) and hasattr(obj, "dtype") -def is_nested_list_like(obj): +def is_nested_list_like(obj) -> bool: """ Check if the object is list-like, and that all of its elements are also list-like. @@ -296,7 +296,7 @@ def is_nested_list_like(obj): ) -def is_dict_like(obj): +def is_dict_like(obj) -> bool: """ Check if the object is dict-like. @@ -328,7 +328,7 @@ def is_dict_like(obj): ) -def is_named_tuple(obj): +def is_named_tuple(obj) -> bool: """ Check if the object is a named tuple. @@ -355,7 +355,7 @@ def is_named_tuple(obj): return isinstance(obj, tuple) and hasattr(obj, "_fields") -def is_hashable(obj): +def is_hashable(obj) -> bool: """ Return True if hash(obj) will succeed, False otherwise. @@ -392,7 +392,7 @@ def is_hashable(obj): return True -def is_sequence(obj): +def is_sequence(obj) -> bool: """ Check if the object is a sequence of objects. String types are not included as sequences here. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0a8f636b4cb2a..25d6f87143d72 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -157,7 +157,8 @@ def _isna_new(obj): def _isna_old(obj): - """Detect missing values. Treat None, NaN, INF, -INF as null. + """ + Detect missing values, treating None, NaN, INF, -INF as null. Parameters ---------- @@ -190,7 +191,9 @@ def _isna_old(obj): def _use_inf_as_na(key): - """Option change callback for na/inf behaviour + """ + Option change callback for na/inf behaviour. + Choose which replacement for numpy.isnan / -numpy.isfinite is used. Parameters @@ -372,7 +375,7 @@ def notna(obj): notnull = notna -def _isna_compat(arr, fill_value=np.nan): +def _isna_compat(arr, fill_value=np.nan) -> bool: """ Parameters ---------- @@ -389,7 +392,7 @@ def _isna_compat(arr, fill_value=np.nan): return True -def array_equivalent(left, right, strict_nan=False): +def array_equivalent(left, right, strict_nan: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. 
It is assumed that left and @@ -445,7 +448,7 @@ def array_equivalent(left, right, strict_nan=False): return False else: try: - if np.any(left_value != right_value): + if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: if "Cannot compare tz-naive" in str(err): @@ -508,7 +511,7 @@ def _maybe_fill(arr, fill_value=np.nan): return arr -def na_value_for_dtype(dtype, compat=True): +def na_value_for_dtype(dtype, compat: bool = True): """ Return a dtype compat na value @@ -566,7 +569,7 @@ def remove_na_arraylike(arr): return arr[notna(lib.values_from_object(arr))] -def is_valid_nat_for_dtype(obj, dtype): +def is_valid_nat_for_dtype(obj, dtype) -> bool: """ isna check that excludes incompatible dtypes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7e3c2200dbabc..46b213b25df49 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,14 +34,8 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib -from pandas.compat import PY36 from pandas.compat.numpy import function as nv -from pandas.util._decorators import ( - Appender, - Substitution, - deprecate_kwarg, - rewrite_axis_style_signature, -) +from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature from pandas.util._validators import ( validate_axis_style_args, validate_bool_kwarg, @@ -67,7 +61,6 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, - is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -78,7 +71,6 @@ is_iterator, is_list_like, is_named_tuple, - is_nested_list_like, is_object_dtype, is_scalar, is_sequence, @@ -344,8 +336,9 @@ class DataFrame(NDFrame): -------- DataFrame.from_records : Constructor from tuples, also record arrays. DataFrame.from_dict : From dicts of Series, arrays, or dicts. - DataFrame.from_items : From sequence of (key, value) pairs - read_csv, pandas.read_table, pandas.read_clipboard. + read_csv + read_table + read_clipboard Examples -------- @@ -388,11 +381,9 @@ class DataFrame(NDFrame): def _constructor(self) -> Type["DataFrame"]: return DataFrame - _constructor_sliced = Series # type: Type[Series] - _deprecations = NDFrame._deprecations | frozenset( - ["from_items"] - ) # type: FrozenSet[str] - _accessors = set() # type: Set[str] + _constructor_sliced: Type[Series] = Series + _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) + _accessors: Set[str] = set() @property def _constructor_expanddim(self): @@ -857,9 +848,9 @@ def style(self): ... index=['panda', 'polar', 'koala']) >>> df species population - panda bear 1864 - polar bear 22000 - koala marsupial 80000 + panda bear 1864 + polar bear 22000 + koala marsupial 80000 >>> for label, content in df.items(): ... print('label:', label) ... print('content:', content, sep='\n') @@ -1023,7 +1014,7 @@ def itertuples(self, index=True, name="Pandas"): # fallback to regular tuples return zip(*arrays) - def __len__(self): + def __len__(self) -> int: """ Returns length of info axis, but here we use the index. """ @@ -1686,9 +1677,7 @@ def from_records( return cls(mgr) - def to_records( - self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None - ): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """ Convert DataFrame to a NumPy record array. @@ -1700,11 +1689,6 @@ def to_records( index : bool, default True Include index in resulting record array, stored in 'index' field or using the index label, if set. 
- convert_datetime64 : bool, default None - .. deprecated:: 0.23.0 - - Whether to convert the index to datetime.datetime if it is a - DatetimeIndex. column_dtypes : str, type, dict, default None .. versionadded:: 0.24.0 @@ -1779,24 +1763,12 @@ def to_records( dtype=[('I', 'S1'), ('A', '` - instead. - :meth:`DataFrame.from_dict(OrderedDict(items)) ` - may be used to preserve the key order. - - Convert (key, value) pairs to DataFrame. The keys will be the axis - index (usually the columns, but depends on the specified - orientation). The values should be arrays or Series. - - Parameters - ---------- - items : sequence of (key, value) pairs - Values should be arrays or Series. - columns : sequence of column labels, optional - Must be passed if orient='index'. - orient : {'columns', 'index'}, default 'columns' - The "orientation" of the data. If the keys of the - input correspond to column labels, pass 'columns' - (default). Otherwise if the keys correspond to the index, - pass 'index'. - - Returns - ------- - DataFrame - """ - - warnings.warn( - "from_items is deprecated. Please use " - "DataFrame.from_dict(dict(items), ...) instead. " - "DataFrame.from_dict(OrderedDict(items)) may be used to " - "preserve the key order.", - FutureWarning, - stacklevel=2, - ) - - keys, values = zip(*items) - - if orient == "columns": - if columns is not None: - columns = ensure_index(columns) - - idict = dict(items) - if len(idict) < len(items): - if not columns.equals(ensure_index(keys)): - raise ValueError( - "With non-unique item names, passed " - "columns must be identical" - ) - arrays = values - else: - arrays = [idict[k] for k in columns if k in idict] - else: - columns = ensure_index(keys) - arrays = values - - # GH 17312 - # Provide more informative error msg when scalar values passed - try: - return cls._from_arrays(arrays, columns, None) - - except ValueError: - if not is_nested_list_like(values): - raise ValueError( - "The value in each (key, value) pair " - "must be an array, Series, or dict" - ) - - elif orient == "index": - if columns is None: - raise TypeError("Must pass columns with orient='index'") - - keys = ensure_index(keys) - - # GH 17312 - # Provide more informative error msg when scalar values passed - try: - arr = np.array(values, dtype=object).T - data = [lib.maybe_convert_objects(v) for v in arr] - return cls._from_arrays(data, columns, keys) - - except TypeError: - if not is_nested_list_like(values): - raise ValueError( - "The value in each (key, value) pair " - "must be an array, Series, or dict" - ) - - else: # pragma: no cover - raise ValueError("'orient' must be either 'columns' or 'index'") - @classmethod def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def to_stata( self, fname, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -2009,8 +1882,6 @@ def to_stata( a datetime column has timezone information. write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Unicode is not supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder`. 
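`DataFrame.from_items` is removed above; its deprecation message already pointed at the replacement, which for reference looks like:

>>> import pandas as pd
>>> items = [("A", [1, 2, 3]), ("B", [4, 5, 6])]
>>> pd.DataFrame.from_dict(dict(items))
   A  B
0  1  4
1  2  5
2  3  6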
time_stamp : datetime @@ -2083,7 +1954,7 @@ def to_stata( data_label=data_label, write_index=write_index, variable_labels=variable_labels, - **kwargs + **kwargs, ) writer.write_file() @@ -2107,7 +1978,7 @@ def to_parquet( compression="snappy", index=None, partition_cols=None, - **kwargs + **kwargs, ): """ Write a DataFrame to the binary parquet format. @@ -2187,7 +2058,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, - **kwargs + **kwargs, ) @Substitution( @@ -2455,7 +2326,7 @@ def info( exceeds_info_cols = len(self.columns) > max_cols def _verbose_repr(): - lines.append("Data columns (total %d columns):" % len(self.columns)) + lines.append(f"Data columns (total {len(self.columns)} columns):") space = max(len(pprint_thing(k)) for k in self.columns) + 4 counts = None @@ -2847,7 +2718,7 @@ def _getitem_bool_array(self, key): ) elif len(key) != len(self.index): raise ValueError( - "Item wrong length %d instead of %d." % (len(key), len(self.index)) + f"Item wrong length {len(key)} instead of {len(self.index)}." ) # check_bool_indexer will throw exception if Series key cannot @@ -2958,7 +2829,7 @@ def _setitem_array(self, key, value): if com.is_bool_indexer(key): if len(key) != len(self.index): raise ValueError( - "Item wrong length %d instead of %d!" % (len(key), len(self.index)) + f"Item wrong length {len(key)} instead of {len(self.index)}!" ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] @@ -3498,16 +3369,12 @@ def assign(self, **kwargs): Notes ----- Assigning multiple columns within the same ``assign`` is possible. - For Python 3.6 and above, later items in '\*\*kwargs' may refer to - newly created or modified columns in 'df'; items are computed and - assigned into 'df' in order. For Python 3.5 and below, the order of - keyword arguments is not specified, you cannot refer to newly created - or modified columns. All items are computed first, and then assigned - in alphabetical order. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. .. versionchanged:: 0.23.0 - Keyword argument order is maintained for Python 3.6 and later. + Keyword argument order is maintained. Examples -------- @@ -3533,9 +3400,8 @@ def assign(self, **kwargs): Portland 17.0 62.6 Berkeley 25.0 77.0 - In Python 3.6+, you can create multiple columns within the same assign - where one of the columns depends on another one defined within the same - assign: + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) @@ -3545,21 +3411,8 @@ def assign(self, **kwargs): """ data = self.copy() - # >= 3.6 preserve order of kwargs - if PY36: - for k, v in kwargs.items(): - data[k] = com.apply_if_callable(v, data) - else: - # <= 3.5: do all calculations first... - results = OrderedDict() - for k, v in kwargs.items(): - results[k] = com.apply_if_callable(v, data) - - # <= 3.5 and earlier - results = sorted(results.items()) - # ... 
and then assign - for k, v in results: - data[k] = v + for k, v in kwargs.items(): + data[k] = com.apply_if_callable(v, data) return data def _sanitize_column(self, key, value, broadcast=True): @@ -4129,7 +3982,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs + **kwargs, ): return super().fillna( value=value, @@ -4138,7 +3991,7 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs + **kwargs, ) @Appender(_shared_docs["replace"] % _shared_doc_kwargs) @@ -4300,7 +4153,7 @@ def set_index( arrays = [] names = [] if append: - names = [x for x in self.index.names] + names = list(self.index.names) if isinstance(self.index, ABCMultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) @@ -4556,8 +4409,8 @@ def _maybe_casted_values(index, labels=None): if not drop: if isinstance(self.index, ABCMultiIndex): names = [ - n if n is not None else ("level_%d" % i) - for (i, n) in enumerate(self.index.names) + (n if n is not None else f"level_{i}") + for i, n in enumerate(self.index.names) ] to_insert = zip(self.index.levels, self.index.codes) else: @@ -4793,7 +4646,7 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False): duplicated = self.duplicated(subset, keep=keep) if inplace: - inds, = (-duplicated)._ndarray_values.nonzero() + (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) self._update_inplace(new_data) else: @@ -4877,8 +4730,7 @@ def sort_values( by = [by] if is_sequence(ascending) and len(by) != len(ascending): raise ValueError( - "Length of ascending (%d) != length of by (%d)" - % (len(ascending), len(by)) + f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: from pandas.core.sorting import lexsort_indexer @@ -5548,11 +5400,6 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) - @deprecate_kwarg( - old_arg_name="raise_conflict", - new_arg_name="errors", - mapping={False: "ignore", True: "raise"}, - ) def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" ): @@ -6585,7 +6432,7 @@ def _gotitem( see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", - **_shared_doc_kwargs + **_shared_doc_kwargs, ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): @@ -6618,9 +6465,7 @@ def transform(self, func, axis=0, *args, **kwargs): return self.T.transform(func, *args, **kwargs).T return super().transform(func, *args, **kwargs) - def apply( - self, func, axis=0, raw=False, reduce=None, result_type=None, args=(), **kwds - ): + def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ Apply a function along an axis of the DataFrame. 
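The ``assign`` body now reduces to a single ordered loop because keyword-argument order is always preserved on the Python versions pandas still supports; a sketch reusing the docstring's own example data:

    import pandas as pd

    df = pd.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"])
    # temp_k may refer to temp_f created in the same call, since the
    # keyword arguments are computed and assigned left to right.
    df = df.assign(
        temp_f=lambda x: x["temp_c"] * 9 / 5 + 32,
        temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9,
    )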
@@ -6962,10 +6807,13 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) - elif isinstance(other, list) and not isinstance(other[0], DataFrame): - other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) + elif isinstance(other, list): + if not other: + pass + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) + if (self.columns.get_indexer(other.columns) >= 0).all(): + other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat @@ -7287,7 +7135,7 @@ def _series_round(s, decimals): if isinstance(decimals, Series): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") - new_cols = [col for col in _dict_round(self, decimals)] + new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.items()] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 49df374670577..2e2ae4e1dfa0a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,7 +8,6 @@ import re from textwrap import dedent from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -93,7 +92,7 @@ # goal is to be able to define the docs close to function, while still being # able to share -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame", @@ -155,7 +154,7 @@ class NDFrame(PandasObject, SelectionMixin): copy : bool, default False """ - _internal_names = [ + _internal_names: List[str] = [ "_data", "_cacher", "_item_cache", @@ -169,33 +168,16 @@ class NDFrame(PandasObject, SelectionMixin): "_metadata", "__array_struct__", "__array_interface__", - ] # type: List[str] - _internal_names_set = set(_internal_names) # type: Set[str] - _accessors = set() # type: Set[str] - _deprecations = frozenset( - [ - "as_blocks", - "as_matrix", - "blocks", - "clip_lower", - "clip_upper", - "get_dtype_counts", - "get_ftype_counts", - "get_values", - "is_copy", - "ftypes", - "ix", - ] - ) # type: FrozenSet[str] - _metadata = [] # type: List[str] + ] + _internal_names_set: Set[str] = set(_internal_names) + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset( + ["get_dtype_counts", "get_values", "ftypes", "ix"] + ) + _metadata: List[str] = [] _is_copy = None - _data = None # type: BlockManager - - if TYPE_CHECKING: - # TODO(PY36): replace with _attrs : Dict[Hashable, Any] - # We need the TYPE_CHECKING, because _attrs is not a class attribute - # and Py35 doesn't support the new syntax. - _attrs = {} # type: Dict[Optional[Hashable], Any] + _data: BlockManager + _attrs: Dict[Optional[Hashable], Any] # ---------------------------------------------------------------------- # Constructors @@ -261,29 +243,6 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) - @property - def is_copy(self): - """ - Return the copy. 
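The ``append`` hunk above guards against an empty list before touching ``other[0]``; an illustrative sketch of the two list cases (contents made up):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2]})
    # An empty list now falls through to concat and effectively returns a copy.
    same = df.append([])
    # A non-empty list whose first element is not a DataFrame is still
    # coerced to a DataFrame and column-aligned first.
    longer = df.append([{"A": 3}], ignore_index=True)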
- """ - warnings.warn( - "Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", - FutureWarning, - stacklevel=2, - ) - return self._is_copy - - @is_copy.setter - def is_copy(self, msg): - warnings.warn( - "Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", - FutureWarning, - stacklevel=2, - ) - self._is_copy = msg - def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -427,8 +386,7 @@ def _construct_axes_from_arguments( if a in kwargs: if alias in kwargs: raise TypeError( - "arguments are mutually exclusive " - "for [%s,%s]" % (a, alias) + f"arguments are mutually exclusive for [{a},{alias}]" ) continue if alias in kwargs: @@ -569,7 +527,7 @@ def axes(self): return [self._get_axis(a) for a in self._AXIS_ORDERS] @property - def ndim(self): + def ndim(self) -> int: """ Return an int representing the number of axes / array dimensions. @@ -760,7 +718,7 @@ def transpose(self, *args, **kwargs): # we must have unique axes if len(axes) != len(set(axes)): - raise ValueError("Must specify %s unique axes" % self._AXIS_LEN) + raise ValueError(f"Must specify {self._AXIS_LEN} unique axes") new_axes = self._construct_axes_dict_from( self, [self._get_axis(x) for x in axes_names] @@ -814,7 +772,8 @@ def droplevel(self, level, axis=0): Returns ------- - DataFrame.droplevel() + DataFrame + DataFrame with requested index / column level(s) removed. Examples -------- @@ -1952,7 +1911,7 @@ def items(self): def iteritems(self): return self.items() - def __len__(self): + def __len__(self) -> int: """Returns length of info axis""" return len(self._info_axis) @@ -2061,7 +2020,7 @@ def __getstate__(self): _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, - **meta + **meta, ) def __setstate__(self, state): @@ -2102,7 +2061,7 @@ def __repr__(self) -> str: # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) prepr = "[%s]" % ",".join(map(pprint_thing, self)) - return "%s(%s)" % (self.__class__.__name__, prepr) + return f"{self.__class__.__name__}({prepr})" def _repr_latex_(self): """ @@ -2637,7 +2596,11 @@ def to_sql( Name of SQL table. con : sqlalchemy.engine.Engine or sqlite3.Connection Using SQLAlchemy makes it possible to use any DB supported by that - library. Legacy support is provided for sqlite3.Connection objects. + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable See `here \ + `_ + schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. @@ -3605,7 +3568,7 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: - inds, = loc.nonzero() + (inds,) = loc.nonzero() return self.take(inds, axis=axis) else: return self.take(loc, axis=axis) @@ -3640,7 +3603,7 @@ class animal locomotion result._set_is_copy(self, copy=not result._is_view) return result - _xs = xs # type: Callable + _xs: Callable = xs def __getitem__(self, item): raise AbstractMethodError(self) @@ -5415,54 +5378,6 @@ def _get_bool_data(self): # ---------------------------------------------------------------------- # Internal Interface Methods - def as_matrix(self, columns=None): - """ - Convert the frame to its Numpy-array representation. - - .. deprecated:: 0.23.0 - Use :meth:`DataFrame.values` instead. 
- - Parameters - ---------- - columns : list, optional, default:None - If None, return all columns, otherwise, returns specified columns. - - Returns - ------- - values : ndarray - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. - - See Also - -------- - DataFrame.values - - Notes - ----- - Return is NOT a Numpy-matrix, rather, a Numpy-array. - - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. By numpy.find_common_type convention, mixing int64 and uint64 - will result in a float64 dtype. - - This method is provided for backwards compatibility. Generally, - it is recommended to use '.values'. - """ - warnings.warn( - "Method .as_matrix will be removed in a future version. " - "Use .values instead.", - FutureWarning, - stacklevel=2, - ) - self._consolidate_inplace() - return self._data.as_array(transpose=self._AXIS_REVERSED, items=columns) - @property def values(self): """ @@ -5652,49 +5567,6 @@ def get_dtype_counts(self): return Series(self._data.get_dtype_counts()) - def get_ftype_counts(self): - """ - Return counts of unique ftypes in this object. - - .. deprecated:: 0.23.0 - - Returns - ------- - dtype : Series - Series with the count of columns with each type and - sparsity (dense/sparse). - - See Also - -------- - ftypes : Return ftypes (indication of sparse/dense and dtype) in - this object. - - Examples - -------- - >>> a = [['a', 1, 1.0], ['b', 2, 2.0], ['c', 3, 3.0]] - >>> df = pd.DataFrame(a, columns=['str', 'int', 'float']) - >>> df - str int float - 0 a 1 1.0 - 1 b 2 2.0 - 2 c 3 3.0 - - >>> df.get_ftype_counts() # doctest: +SKIP - float64:dense 1 - int64:dense 1 - object:dense 1 - dtype: int64 - """ - warnings.warn( - "get_ftype_counts is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - - from pandas import Series - - return Series(self._data.get_ftype_counts()) - @property def dtypes(self): """ @@ -5780,40 +5652,6 @@ def ftypes(self): return Series(self._data.get_ftypes(), index=self._info_axis, dtype=np.object_) - def as_blocks(self, copy=True): - """ - Convert the frame to a dict of dtype -> Constructor Types. - - .. deprecated:: 0.21.0 - - NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in - as_matrix) - - Parameters - ---------- - copy : bool, default True - - Returns - ------- - dict - Mapping dtype -> Constructor Types. - """ - warnings.warn( - "as_blocks is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - return self._to_dict_of_blocks(copy=copy) - - @property - def blocks(self): - """ - Internal property, property synonym for as_blocks(). - - .. 
deprecated:: 0.21.0 - """ - return self.as_blocks() - def _to_dict_of_blocks(self, copy=True): """ Return a dict of dtype -> Constructor Types that @@ -6363,7 +6201,7 @@ def fillna( elif isinstance(value, ABCDataFrame) and self.ndim == 2: new_data = self.where(self.notna(), value) else: - raise ValueError("invalid fill value with a %s" % type(value)) + raise ValueError(f"invalid fill value with a {type(value)}") if inplace: self._update_inplace(new_data) @@ -6800,9 +6638,8 @@ def replace( if is_list_like(value): if len(to_replace) != len(value): raise ValueError( - "Replacement lists must match " - "in length. Expecting %d got %d " - % (len(to_replace), len(value)) + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " ) new_data = self._data.replace_list( @@ -7056,7 +6893,7 @@ def interpolate( limit_direction="forward", limit_area=None, downcast=None, - **kwargs + **kwargs, ): """ Interpolate values according to different methods. @@ -7130,7 +6967,7 @@ def interpolate( limit_area=limit_area, inplace=inplace, downcast=downcast, - **kwargs + **kwargs, ) if inplace: @@ -7618,208 +7455,6 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs return result - def clip_upper(self, threshold, axis=None, inplace=False): - """ - Trim values above a given threshold. - - .. deprecated:: 0.24.0 - Use clip(upper=threshold) instead. - - Elements above the `threshold` will be changed to match the - `threshold` value(s). Threshold can be a single value or an array, - in the latter case it performs the truncation element-wise. - - Parameters - ---------- - threshold : numeric or array-like - Maximum value allowed. All values above threshold will be set to - this value. - - * float : every value is compared to `threshold`. - * array-like : The shape of `threshold` should match the object - it's compared to. When `self` is a Series, `threshold` should be - the length. When `self` is a DataFrame, `threshold` should 2-D - and the same shape as `self` for ``axis=None``, or 1-D and the - same length as the axis being compared. - - axis : {0 or 'index', 1 or 'columns'}, default 0 - Align object with `threshold` along the given axis. - inplace : bool, default False - Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 - - Returns - ------- - Series or DataFrame - Original data with values trimmed. - - See Also - -------- - Series.clip : General purpose method to trim Series values to given - threshold(s). - DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s). - - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4, 5]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - dtype: int64 - - >>> s.clip(upper=3) - 0 1 - 1 2 - 2 3 - 3 3 - 4 3 - dtype: int64 - - >>> elemwise_thresholds = [5, 4, 3, 2, 1] - >>> elemwise_thresholds - [5, 4, 3, 2, 1] - - >>> s.clip(upper=elemwise_thresholds) - 0 1 - 1 2 - 2 3 - 3 2 - 4 1 - dtype: int64 - """ - warnings.warn( - "clip_upper(threshold) is deprecated, use clip(upper=threshold) instead", - FutureWarning, - stacklevel=2, - ) - return self._clip_with_one_bound( - threshold, method=self.le, axis=axis, inplace=inplace - ) - - def clip_lower(self, threshold, axis=None, inplace=False): - """ - Trim values below a given threshold. - - .. deprecated:: 0.24.0 - Use clip(lower=threshold) instead. - - Elements below the `threshold` will be changed to match the - `threshold` value(s). 
Threshold can be a single value or an array, - in the latter case it performs the truncation element-wise. - - Parameters - ---------- - threshold : numeric or array-like - Minimum value allowed. All values below threshold will be set to - this value. - - * float : every value is compared to `threshold`. - * array-like : The shape of `threshold` should match the object - it's compared to. When `self` is a Series, `threshold` should be - the length. When `self` is a DataFrame, `threshold` should 2-D - and the same shape as `self` for ``axis=None``, or 1-D and the - same length as the axis being compared. - - axis : {0 or 'index', 1 or 'columns'}, default 0 - Align `self` with `threshold` along the given axis. - - inplace : bool, default False - Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 - - Returns - ------- - Series or DataFrame - Original data with values trimmed. - - See Also - -------- - Series.clip : General purpose method to trim Series values to given - threshold(s). - DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s). - - Examples - -------- - - Series single threshold clipping: - - >>> s = pd.Series([5, 6, 7, 8, 9]) - >>> s.clip(lower=8) - 0 8 - 1 8 - 2 8 - 3 8 - 4 9 - dtype: int64 - - Series clipping element-wise using an array of thresholds. `threshold` - should be the same length as the Series. - - >>> elemwise_thresholds = [4, 8, 7, 2, 5] - >>> s.clip(lower=elemwise_thresholds) - 0 5 - 1 8 - 2 7 - 3 8 - 4 9 - dtype: int64 - - DataFrames can be compared to a scalar. - - >>> df = pd.DataFrame({"A": [1, 3, 5], "B": [2, 4, 6]}) - >>> df - A B - 0 1 2 - 1 3 4 - 2 5 6 - - >>> df.clip(lower=3) - A B - 0 3 3 - 1 3 4 - 2 5 6 - - Or to an array of values. By default, `threshold` should be the same - shape as the DataFrame. - - >>> df.clip(lower=np.array([[3, 4], [2, 2], [6, 2]])) - A B - 0 3 4 - 1 3 4 - 2 6 6 - - Control how `threshold` is broadcast with `axis`. In this case - `threshold` should be the same length as the axis specified by - `axis`. - - >>> df.clip(lower=[3, 3, 5], axis='index') - A B - 0 3 3 - 1 3 4 - 2 5 6 - - >>> df.clip(lower=[4, 5], axis='columns') - A B - 0 4 5 - 1 4 5 - 2 5 6 - """ - warnings.warn( - "clip_lower(threshold) is deprecated, use clip(lower=threshold) instead", - FutureWarning, - stacklevel=2, - ) - return self._clip_with_one_bound( - threshold, method=self.ge, axis=axis, inplace=inplace - ) - def groupby( self, by=None, @@ -7830,7 +7465,6 @@ def groupby( group_keys=True, squeeze=False, observed=False, - **kwargs ): """ Group DataFrame or Series using a mapper or by a Series of columns. @@ -7876,10 +7510,6 @@ def groupby( .. versionadded:: 0.23.0 - **kwargs - Optional, only accepts keyword argument 'mutated' and is passed - to groupby. 
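Everything the removed ``clip_lower``/``clip_upper`` methods did is covered by ``clip``, as their own deprecation notes already advised; a minimal sketch:

    import pandas as pd

    s = pd.Series([1, 2, 3, 4, 5])
    s.clip(lower=2)           # was s.clip_lower(2)
    s.clip(upper=3)           # was s.clip_upper(3)
    s.clip(lower=2, upper=4)  # both bounds in one call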
- Returns ------- DataFrameGroupBy or SeriesGroupBy @@ -7941,12 +7571,13 @@ def groupby( Captive 210.0 Wild 185.0 """ - from pandas.core.groupby.groupby import groupby + from pandas.core.groupby.groupby import get_groupby if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby( + + return get_groupby( self, by=by, axis=axis, @@ -7956,7 +7587,6 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, - **kwargs ) def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): @@ -8882,7 +8512,7 @@ def align( fill_axis=fill_axis, ) else: # pragma: no cover - raise TypeError("unsupported type: %s" % type(other)) + raise TypeError(f"unsupported type: {type(other)}") def _align_frame( self, @@ -9526,9 +9156,9 @@ def tshift(self, periods=1, freq=None, axis=0): new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) else: - msg = "Given freq %s does not match PeriodIndex freq %s" % ( - freq.rule_code, - orig_freq.rule_code, + msg = ( + f"Given freq {freq.rule_code} does not match" + f" PeriodIndex freq {orig_freq.rule_code}" ) raise ValueError(msg) else: @@ -9676,7 +9306,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): if before is not None and after is not None: if before > after: - raise ValueError("Truncate: %s must be after %s" % (after, before)) + raise ValueError(f"Truncate: {after} must be after {before}") slicer = [slice(None, None)] * self._AXIS_LEN slicer[axis] = slice(before, after) @@ -9722,7 +9352,7 @@ def _tz_convert(ax, tz): if len(ax) > 0: ax_name = self._get_axis_name(axis) raise TypeError( - "%s is not a valid DatetimeIndex or PeriodIndex" % ax_name + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" ) else: ax = DatetimeIndex([], tz=tz) @@ -9886,7 +9516,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): if len(ax) > 0: ax_name = self._get_axis_name(axis) raise TypeError( - "%s is not a valid DatetimeIndex or PeriodIndex" % ax_name + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" ) else: ax = DatetimeIndex([], tz=tz) @@ -10454,6 +10084,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwar data = self.fillna(method=fill_method, limit=limit, axis=axis) rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 + rs = rs.loc[~rs.index.duplicated()] rs = rs.reindex_like(data) if freq is None: mask = isna(com.values_from_object(data)) @@ -11583,7 +11214,7 @@ def stat_func( level=None, numeric_only=None, min_count=0, - **kwargs + **kwargs, ): if name == "sum": nv.validate_sum(tuple(), kwargs) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index fed387cbeade4..407cd8342d486 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -3,8 +3,12 @@ hold the whitelist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. 
""" +import collections + from pandas.core.dtypes.common import is_list_like, is_scalar +OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) + class GroupByMixin: """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 56a8a7d15077b..99ef281e842b1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -14,21 +14,17 @@ Any, Callable, FrozenSet, - Hashable, Iterable, - Optional, + Mapping, Sequence, - Tuple, Type, Union, cast, ) -import warnings import numpy as np from pandas._libs import Timestamp, lib -from pandas.compat import PY36 from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( @@ -62,9 +58,9 @@ GroupBy, _apply_docs, _transform_template, - groupby, + get_groupby, ) -from pandas.core.index import Index, MultiIndex, _all_indexes_same +from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -143,8 +139,8 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: - yield self._selection_name, self._selected_obj + def _iterate_slices(self) -> Iterable[Series]: + yield self._selected_obj @property def _selection_name(self): @@ -226,17 +222,12 @@ def apply(self, func, *args, **kwargs): ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - _level = kwargs.pop("_level", None) relabeling = func is None columns = None no_arg_message = "Must provide 'func' or named aggregation **kwargs." if relabeling: columns = list(kwargs) - if not PY36: - # sort for 3.5 and earlier - columns = list(sorted(columns)) - func = [kwargs[col] for col in columns] kwargs = {} if not columns: @@ -249,7 +240,7 @@ def aggregate(self, func=None, *args, **kwargs): # Catch instances of lists / tuples # but not the class list / tuple itself. 
func = _maybe_mangle_lambdas(func) - ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) + ret = self._aggregate_multiple_funcs(func) if relabeling: ret.columns = columns else: @@ -262,10 +253,7 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except (ValueError, KeyError, AttributeError, IndexError): - # TODO: IndexError can be removed here following GH#29106 - # TODO: AttributeError is caused by _index_data hijinx in - # libreduction, can be removed after GH#29160 + except (ValueError, KeyError): # TODO: KeyError is raised in _python_agg_general, # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) @@ -276,8 +264,7 @@ def aggregate(self, func=None, *args, **kwargs): if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") - # _level handled at higher - if not _level and isinstance(ret, dict): + if isinstance(ret, dict): from pandas import concat ret = concat(ret, axis=1) @@ -285,23 +272,14 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _aggregate_multiple_funcs(self, arg, _level): + def _aggregate_multiple_funcs(self, arg): if isinstance(arg, dict): # show the deprecation, but only if we # have not shown a higher level one # GH 15931 - if isinstance(self._selected_obj, Series) and _level <= 1: - msg = dedent( - """\ - using a dict on a Series for aggregation - is deprecated and will be removed in a future version. Use \ - named aggregation instead. - - >>> grouper.agg(name_1=func_1, name_2=func_2) - """ - ) - warnings.warn(msg, FutureWarning, stacklevel=3) + if isinstance(self._selected_obj, Series): + raise SpecificationError("nested renamer is not supported") columns = list(arg.keys()) arg = arg.items() @@ -323,8 +301,7 @@ def _aggregate_multiple_funcs(self, arg, _level): obj = self if name in results: raise SpecificationError( - "Function names must be unique, found multiple named " - "{name}".format(name=name) + f"Function names must be unique, found multiple named {name}" ) # reset the cache so that we @@ -337,33 +314,95 @@ def _aggregate_multiple_funcs(self, arg, _level): if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle - if _level: - return results + return results return DataFrame(results, columns=columns) - def _wrap_series_output(self, output, index, names=None): - """ common agg/transform wrapping logic """ - output = output[self._selection_name] + def _wrap_series_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy operation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + index : pd.Index + Index to apply to the output. + + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output and columns will only contain one + element. The exception is operations that expand dimensions, like ohlc. 
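The wrapping helpers above are keyed by the ``OutputKey`` namedtuple added to pandas/core/groupby/base.py earlier in this diff; a sketch of how such a mapping is assembled and consumed (values made up):

    import numpy as np
    from pandas.core.groupby.base import OutputKey

    output = {OutputKey(label="x", position=0): np.array([1.0, 2.0])}
    # label feeds the resulting column/Series name, position fixes the order.
    labels = [key.label for key in output]
    values = {key.position: val for key, val in output.items()}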
+ """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) - if names is not None: - return DataFrame(output, index=index, columns=names) + result: Union[Series, DataFrame] + if len(output) > 1: + result = DataFrame(indexed_output, index=index) + result.columns = columns else: - name = self._selection_name - if name is None: - name = self._selected_obj.name - return Series(output, index=index, name=name) + result = Series(indexed_output[0], index=index, name=columns[0]) + + return result + + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + Series or DataFrame - def _wrap_aggregated_output(self, output, names=None): + Notes + ----- + In the vast majority of cases output will only contain one element. + The exception is operations that expand dimensions, like ohlc. + """ result = self._wrap_series_output( - output=output, index=self.grouper.result_index, names=names + output=output, index=self.grouper.result_index ) return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): - return self._wrap_series_output( - output=output, index=self.obj.index, names=names - ) + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Series: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : dict[base.OutputKey, Union[Series, np.ndarray]] + Dict with a sole key of 0 and a value of the result values. + + Returns + ------- + Series + + Notes + ----- + output should always contain one element. It is specified as a dict + for consistency with DataFrame methods and _wrap_aggregated_output. + """ + assert len(output) == 1 + result = self._wrap_series_output(output=output, index=self.obj.index) + + # No transformations increase the ndim of the result + assert isinstance(result, Series) + return result def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: @@ -405,7 +444,7 @@ def _aggregate_named(self, func, *args, **kwargs): output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): raise ValueError("Must produce aggregated value") - result[name] = self._try_cast(output, group) + result[name] = output return result @@ -414,35 +453,39 @@ def _aggregate_named(self, func, *args, **kwargs): def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func - if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" - raise ValueError(msg.format(func=func)) - if func in base.cythonized_kernels: - # cythonized transform or canned "agg+broadcast" - return getattr(self, func)(*args, **kwargs) - else: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. 
- return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs), func - ) + if not isinstance(func, str): + return self._transform_general(func, *args, **kwargs) + + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transform or canned "agg+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + result = getattr(self, func)(*args, **kwargs) + return self._transform_fast(result, func) - # reg transform + def _transform_general(self, func, *args, **kwargs): + """ + Transform with a non-str `func`. + """ klass = self._selected_obj.__class__ + results = [] - wrapper = lambda x: func(x, *args, **kwargs) for name, group in self: object.__setattr__(group, "name", name) - res = wrapper(group) + res = func(group, *args, **kwargs) if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values indexer = self._get_index(name) - s = klass(res, indexer) - results.append(s) + ser = klass(res, indexer) + results.append(ser) # check for empty "results" to avoid concat ValueError if results: @@ -453,7 +496,7 @@ def transform(self, func, *args, **kwargs): result = Series() # we will only try to coerce the result type if - # we have a numeric dtype, as these are *always* udfs + # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) dtype = self._selected_obj.dtype if is_numeric_dtype(dtype): @@ -463,17 +506,14 @@ def transform(self, func, *args, **kwargs): result.index = self._selected_obj.index return result - def _transform_fast(self, func, func_nm) -> Series: + def _transform_fast(self, result, func_nm: str) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions """ - if isinstance(func, str): - func = getattr(self, func) - ids, _, ngroup = self.grouper.group_info cast = self._transform_should_cast(func_nm) - out = algorithms.take_1d(func()._values, ids) + out = algorithms.take_1d(result._values, ids) if cast: out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) @@ -547,7 +587,7 @@ def nunique(self, dropna: bool = True) -> Series: try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - msg = "val.dtype must be object, got {}".format(val.dtype) + msg = f"val.dtype must be object, got {val.dtype}" assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) @@ -589,7 +629,8 @@ def nunique(self, dropna: bool = True) -> Series: res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, index=ri, name=self._selection_name) + result = Series(res, index=ri, name=self._selection_name) + return self._reindex_output(result, fill_value=0) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -741,12 +782,13 @@ def count(self) -> Series: minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series( + result = Series( out, index=self.grouper.result_index, name=self._selection_name, dtype="int64", ) + return self._reindex_output(result, fill_value=0) def _apply_to_column_groupbys(self, func): """ return a pass thru """ @@ -864,7 +906,6 @@ class DataFrameGroupBy(GroupBy): ) @Appender(_shared_docs["aggregate"]) def 
aggregate(self, func=None, *args, **kwargs): - _level = kwargs.pop("_level", None) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: @@ -877,7 +918,7 @@ def aggregate(self, func=None, *args, **kwargs): func = _maybe_mangle_lambdas(func) - result, how = self._aggregate(func, _level=_level, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if how is None: return result @@ -888,31 +929,21 @@ def aggregate(self, func=None, *args, **kwargs): return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + result = self._aggregate_frame(func) + else: # try to treat as if we are passing a list try: - result = self._aggregate_multiple_funcs( - [func], _level=_level, _axis=self.axis - ) + result = self._aggregate_multiple_funcs([func], _axis=self.axis) except ValueError as err: if "no results" not in str(err): # raised directly by _aggregate_multiple_funcs raise result = self._aggregate_frame(func) - except NotImplementedError as err: - if "axis other than 0 is not supported" in str(err): - # raised directly by _aggregate_multiple_funcs - pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for DecimalArray tests - pass - else: - raise - # FIXME: this is raised in a bunch of - # test_whitelist.test_regression_whitelist_methods tests, - # can be avoided - result = self._aggregate_frame(func) else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name @@ -932,20 +963,20 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: obj = obj.T if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] - yield obj.name, obj + yield obj else: for label, values in obj.items(): if label in self.exclusions: continue - yield label, values + yield values def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -997,7 +1028,7 @@ def _cython_agg_blocks( # reductions; see GH#28949 obj = obj.iloc[:, 0] - s = groupby(obj, self.grouper) + s = get_groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: @@ -1067,23 +1098,23 @@ def _cython_agg_blocks( return new_items, new_blocks - def _aggregate_frame(self, func, *args, **kwargs): + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") axis = self.axis obj = self._obj_with_exclusions - result = OrderedDict() + result: OrderedDict = OrderedDict() if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres else: for name in self.indices: data = self.get_group(name, obj=obj) fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres return self._wrap_frame_output(result, obj) @@ -1091,7 +1122,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 obj = self._obj_with_exclusions - result = OrderedDict() # type: dict + result: OrderedDict = OrderedDict() cannot_agg = [] errors = None for item in obj: @@ -1125,17 +1156,6 @@ def 
_aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return DataFrame(result, columns=result_columns) - def _decide_output_index(self, output, labels): - if len(output) == len(labels): - output_keys = labels - else: - output_keys = sorted(output) - - if isinstance(labels, MultiIndex): - output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) - - return output_keys - def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) @@ -1195,7 +1215,7 @@ def first_not_none(values): if isinstance(v, (np.ndarray, Index, Series)): if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = _all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same([x.index for x in values]) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1361,21 +1381,21 @@ def transform(self, func, *args, **kwargs): # optimized transforms func = self._get_cython_func(func) or func - if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" - raise ValueError(msg.format(func=func)) - if func in base.cythonized_kernels: - # cythonized transformation or canned "reduction+broadcast" - return getattr(self, func)(*args, **kwargs) - else: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - result = getattr(self, func)(*args, **kwargs) - else: + if not isinstance(func, str): return self._transform_general(func, *args, **kwargs) + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transformation or canned "reduction+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. 
+ result = getattr(self, func)(*args, **kwargs) + # a reduction transform if not isinstance(result, DataFrame): return self._transform_general(func, *args, **kwargs) @@ -1386,9 +1406,9 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - return self._transform_fast(result, obj, func) + return self._transform_fast(result, func) - def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFrame: + def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: """ Fast transform path for aggregations """ @@ -1396,12 +1416,16 @@ def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFra # try casting data to original dtype cast = self._transform_should_cast(func_nm) + obj = self._obj_with_exclusions + # for each col, reshape to to size of original frame # by take operation ids, _, ngroup = self.grouper.group_info output = [] for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) + # TODO: we have no test cases that get here with EA dtypes; + # try_cast may not be needed if EAs never get here if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) @@ -1421,7 +1445,7 @@ def _define_paths(self, func, *args, **kwargs): ) return fast_path, slow_path - def _choose_path(self, fast_path, slow_path, group): + def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): path = slow_path res = slow_path(group) @@ -1431,8 +1455,8 @@ def _choose_path(self, fast_path, slow_path, group): except AssertionError: raise except Exception: - # Hard to know ex-ante what exceptions `fast_path` might raise - # TODO: no test cases get here + # GH#29631 For user-defined function, we cant predict what may be + # raised; see test_transform.test_transform_fastpath_raises return path, res # verify fast path does not change columns (and names), otherwise @@ -1527,8 +1551,8 @@ def filter(self, func, dropna=True, *args, **kwargs): else: # non scalars aren't allowed raise TypeError( - "filter function returned a {typ}, " - "but expected a scalar bool".format(typ=type(res).__name__) + f"filter function returned a {type(res).__name__}, " + "but expected a scalar bool" ) return self._apply_filter(indices, dropna) @@ -1600,27 +1624,62 @@ def _insert_inaxis_grouper_inplace(self, result): if in_axis: result.insert(0, name, lev) - def _wrap_aggregated_output(self, output, names=None): - agg_axis = 0 if self.axis == 1 else 1 - agg_labels = self._obj_with_exclusions._get_axis(agg_axis) + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy aggregations into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. 
+ + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) - output_keys = self._decide_output_index(output, agg_labels) + result = DataFrame(indexed_output) + result.columns = columns if not self.as_index: - result = DataFrame(output, columns=output_keys) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - result = DataFrame(output, index=index, columns=output_keys) + result.index = index if self.axis == 1: result = result.T return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None) -> DataFrame: - return DataFrame(output, index=self.obj.index) + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy transformations into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns + result.index = self.obj.index + + return result def _wrap_agged_blocks(self, items, blocks): if not self.as_index: @@ -1740,9 +1799,11 @@ def groupby_series(obj, col=None): if isinstance(obj, Series): results = groupby_series(obj) else: + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions from pandas.core.reshape.concat import concat - results = [groupby_series(obj[col], col) for col in obj.columns] + results = [groupby_series(content, label) for label, content in obj.items()] results = concat(results, axis=1) results.columns.names = obj.columns.names @@ -1784,7 +1845,7 @@ def _normalize_keyword_aggregation(kwargs): """ Normalize user-provided "named aggregation" kwargs. - Transforms from the new ``Dict[str, NamedAgg]`` style kwargs + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs to the old OrderedDict[str, List[scalar]]]. Parameters @@ -1805,10 +1866,7 @@ def _normalize_keyword_aggregation(kwargs): >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) """ - if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) - - # Normalize the aggregation functions as Dict[column, List[func]], + # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) @@ -1893,7 +1951,7 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: for aggfunc in aggfuncs: if com.get_callable_name(aggfunc) == "": aggfunc = partial(aggfunc) - aggfunc.__name__ = "".format(i) + aggfunc.__name__ = f"" i += 1 mangled_aggfuncs.append(aggfunc) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e73be29d5b104..9e12ac82fb3ae 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,14 +7,23 @@ class providing the base-class of operations. expose these user-facing objects to provide specific functionailty. 
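Since ``_normalize_keyword_aggregation`` above no longer sorts kwargs for pre-3.6 interpreters, named aggregation keeps exactly the order it was written in; a usage sketch (data made up):

    import pandas as pd

    df = pd.DataFrame({"kind": ["a", "a", "b"], "height": [9.1, 6.0, 9.5]})
    # Output columns appear in the order the keywords are given.
    df.groupby("kind").agg(
        min_height=pd.NamedAgg(column="height", aggfunc="min"),
        max_height=pd.NamedAgg(column="height", aggfunc="max"),
    )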
""" -import collections from contextlib import contextmanager import datetime from functools import partial, wraps import inspect import re import types -from typing import FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union +from typing import ( + Dict, + FrozenSet, + Iterable, + List, + Mapping, + Optional, + Tuple, + Type, + Union, +) import numpy as np @@ -26,13 +35,11 @@ class providing the base-class of operations. from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.util._validators import validate_kwargs from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( ensure_float, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, @@ -41,15 +48,15 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna +from pandas._typing import FrameOrSeries, Scalar from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical, try_cast_to_ea +from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base +from pandas.core.groupby import base, ops from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -338,7 +345,7 @@ def _group_selection_context(groupby): class _GroupBy(PandasObject, SelectionMixin): _group_selection = None - _apply_whitelist = frozenset() # type: FrozenSet[str] + _apply_whitelist: FrozenSet[str] = frozenset() def __init__( self, @@ -346,15 +353,15 @@ def __init__( keys=None, axis: int = 0, level=None, - grouper=None, + grouper: "Optional[ops.BaseGrouper]" = None, exclusions=None, selection=None, - as_index=True, - sort=True, - group_keys=True, - squeeze=False, - observed=False, - **kwargs + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, ): self._selection = selection @@ -376,7 +383,7 @@ def __init__( self.group_keys = group_keys self.squeeze = squeeze self.observed = observed - self.mutated = kwargs.pop("mutated", False) + self.mutated = mutated if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -396,10 +403,7 @@ def __init__( self.grouper = grouper self.exclusions = set(exclusions) if exclusions else set() - # we accept no other args - validate_kwargs("group", kwargs, {}) - - def __len__(self): + def __len__(self) -> int: return len(self.groups) def __repr__(self) -> str: @@ -443,7 +447,7 @@ def _get_indices(self, names): def get_converter(s): # possibly convert to the actual key types # in the indices, could be a Timestamp or a np.datetime64 - if isinstance(s, (Timestamp, datetime.datetime)): + if isinstance(s, datetime.datetime): return lambda key: Timestamp(key) elif isinstance(s, np.datetime64): return lambda key: Timestamp(key).asm8 @@ -492,6 +496,7 @@ def _get_index(self, name): @cache_readonly def _selected_obj(self): + # Note: _selected_obj is always just `self.obj` for SeriesGroupBy if self._selection is None or isinstance(self.obj, 
Series): if self._group_selection is not None: @@ -561,9 +566,7 @@ def __getattr__(self, attr): return self[attr] raise AttributeError( - "'{typ}' object has no attribute '{attr}'".format( - typ=type(self).__name__, attr=attr - ) + f"'{type(self).__name__}' object has no attribute '{attr}'" ) @Substitution( @@ -636,24 +639,14 @@ def curried(x): # TODO: is the above comment accurate? raise - # related to : GH3688 - # try item-by-item - # this can be called recursively, so need to raise - # ValueError - # if we don't have this method to indicated to aggregate to - # mark this column as an error - try: - result = self._aggregate_item_by_item(name, *args, **kwargs) - assert self.obj.ndim == 2 - return result - except AttributeError: - # e.g. SparseArray has no flags attr - # FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item' - # occurs in idxmax() case - # in tests.groupby.test_function.test_non_cython_api - assert self.obj.ndim == 1 + if self.obj.ndim == 1: + # this can be called recursively, so need to raise ValueError raise ValueError + # GH#3688 try to operate item-by-item + result = self._aggregate_item_by_item(name, *args, **kwargs) + return result + wrapper.__name__ = name return wrapper @@ -750,7 +743,7 @@ def _python_apply_general(self, f): keys, values, not_indexed_same=mutated or self.mutated ) - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): @@ -803,22 +796,11 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - if is_datetime64tz_dtype(dtype): - # GH 23683 - # Prior results _may_ have been generated in UTC. - # Ensure we localize to UTC first before converting - # to the target timezone - arr = extract_array(obj) - try: - result = arr._from_sequence(result, dtype="datetime64[ns, UTC]") - result = result.astype(dtype) - except TypeError: - # _try_cast was called at a point where the result - # was already tz-aware - pass - elif is_extension_array_dtype(dtype): + if is_extension_array_dtype(dtype) and dtype.kind != "M": # The function can return something of any type, so check - # if the type is compatible with the calling EA. + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. 
# return the same type (Series) as our caller cls = dtype.construct_array_type() @@ -845,30 +827,33 @@ def _transform_should_cast(self, func_nm: str) -> bool: ) def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): - output = collections.OrderedDict() # type: dict - for name, obj in self._iterate_slices(): + output: Dict[base.OutputKey, np.ndarray] = {} + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue try: - result, names = self.grouper.transform(obj.values, how, **kwargs) + result, _ = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue + if self._transform_should_cast(how): - output[name] = self._try_cast(result, obj) - else: - output[name] = result + result = self._try_cast(result, obj) + + key = base.OutputKey(label=name, position=idx) + output[key] = result if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_transformed_output(output, names) + return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output, names=None): + def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_transformed_output(self, output, names=None): + def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): @@ -877,27 +862,53 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output = {} - for name, obj in self._iterate_slices(): + output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {} + # Ideally we would be able to enumerate self._iterate_slices and use + # the index from enumeration as the key of output, but ohlc in particular + # returns a (n x 4) array. Output requires 1D ndarrays as values, so we + # need to slice that up into 1D arrays + idx = 0 + for obj in self._iterate_slices(): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue - result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) - output[name] = self._try_cast(result, obj) + result, agg_names = self.grouper.aggregate( + obj._values, how, min_count=min_count + ) + + if agg_names: + # e.g. 
ohlc + assert len(agg_names) == result.shape[1] + for result_column, result_name in zip(result.T, agg_names): + key = base.OutputKey(label=result_name, position=idx) + output[key] = self._try_cast(result_column, obj) + idx += 1 + else: + assert result.ndim == 1 + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj) + idx += 1 if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output, names) + return self._wrap_aggregated_output(output) def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = {} - for name, obj in self._iterate_slices(): + output: Dict[base.OutputKey, np.ndarray] = {} + + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name + if self.grouper.ngroups == 0: + # agg_series below assumes ngroups > 0 + continue + try: # if this function is invalid for this dtype, we will ignore it. func(obj[:0]) @@ -911,10 +922,9 @@ def _python_agg_general(self, func, *args, **kwargs): pass result, counts = self.grouper.agg_series(obj, f) - if result is not None: - # TODO: only 3 test cases get None here, do something - # in those cases - output[name] = self._try_cast(result, obj, numeric_only=True) + assert result is not None + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -922,14 +932,14 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for name, result in output.items(): + for key, result in output.items(): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[name] = self._try_cast(values[mask], result) + output[key] = self._try_cast(values[mask], result) return self._wrap_aggregated_output(output) @@ -1226,7 +1236,7 @@ def median(self, **kwargs): return self._cython_agg_general( "median", alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), - **kwargs + **kwargs, ) @Substitution(name="groupby") @@ -1315,7 +1325,7 @@ def size(self): if isinstance(self.obj, Series): result.name = self.obj.name - return result + return self._reindex_output(result, fill_value=0) @classmethod def _add_numeric_operations(cls): @@ -1361,22 +1371,11 @@ def f(self, **kwargs): # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for test_decimal:test_in_numeric_groupby - pass else: raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - - # coerce the resulting columns if we can - if isinstance(result, DataFrame): - for col in result.columns: - result[col] = self._try_cast(result[col], self.obj[col]) - else: - result = self._try_cast(result, self.obj) - return result set_function_name(f, name, cls) @@ -1770,6 +1769,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra if not self.observed and isinstance(result_index, CategoricalIndex): out = out.reindex(result_index) + out = self._reindex_output(out) return out.sort_index() if self.sort else out # dropna is truthy @@ -1781,7 +1781,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) 
-> DataFra raise ValueError( "For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " - "(was passed {dropna}).".format(dropna=dropna) + f"(was passed {dropna})." ) # old behaviour, but with all and any support for DataFrames. @@ -2193,7 +2193,7 @@ def _get_cythonized_result( result_is_index: bool = False, pre_processing=None, post_processing=None, - **kwargs + **kwargs, ): """ Get result for Cythonized functions. @@ -2251,10 +2251,11 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() # type: dict + output: Dict[base.OutputKey, np.ndarray] = {} base_func = getattr(libgroupby, how) - for name, obj in self._iterate_slices(): + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name values = obj._data._values if aggregate: @@ -2287,7 +2288,8 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output[name] = result + key = base.OutputKey(label=name, position=idx) + output[key] = result if aggregate: return self._wrap_aggregated_output(output) @@ -2409,7 +2411,9 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, output): + def _reindex_output( + self, output: FrameOrSeries, fill_value: Scalar = np.NaN + ) -> FrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2423,8 +2427,10 @@ def _reindex_output(self, output): Parameters ---------- - output: Series or DataFrame + output : Series or DataFrame Object resulting from grouping and applying an operation. + fill_value : scalar, default np.NaN + Value to use for unobserved categories if self.observed is False. 
Returns ------- @@ -2455,7 +2461,11 @@ def _reindex_output(self, output): ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, "copy": False} + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } return output.reindex(**d) # GH 13204 @@ -2477,7 +2487,9 @@ def _reindex_output(self, output): output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) @@ -2490,18 +2502,46 @@ def _reindex_output(self, output): @Appender(GroupBy.__doc__) -def groupby(obj: NDFrame, by, **kwds): +def get_groupby( + obj: NDFrame, + by=None, + axis: int = 0, + level=None, + grouper: "Optional[ops.BaseGrouper]" = None, + exclusions=None, + selection=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, +): + + klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy - klass = ( - SeriesGroupBy - ) # type: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] + klass = SeriesGroupBy elif isinstance(obj, DataFrame): from pandas.core.groupby.generic import DataFrameGroupBy klass = DataFrameGroupBy else: - raise TypeError("invalid type: {obj}".format(obj=obj)) - - return klass(obj, by, **kwds) + raise TypeError(f"invalid type: {obj}") + + return klass( + obj=obj, + keys=by, + axis=axis, + level=level, + grouper=grouper, + exclusions=exclusions, + selection=selection, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + mutated=mutated, + ) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 370abe75e1327..308d4d1864bdd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -26,8 +26,8 @@ from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com from pandas.core.frame import DataFrame +from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby -from pandas.core.groupby.ops import BaseGrouper from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series @@ -93,7 +93,7 @@ class Grouper: >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ - _attributes = ("key", "level", "freq", "axis", "sort") # type: Tuple[str, ...] + _attributes: Tuple[str, ...] 
= ("key", "level", "freq", "axis", "sort") def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: @@ -119,7 +119,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): def ax(self): return self.grouper - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): """ Parameters ---------- @@ -143,17 +143,18 @@ def _get_grouper(self, obj, validate=True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj, sort=False): + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification Parameters ---------- - obj : the subject object + obj : Series or DataFrame sort : bool, default False whether the resulting grouper should be sorted """ + assert obj is not None if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") @@ -172,9 +173,7 @@ def _set_grouper(self, obj, sort=False): ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: - raise KeyError( - "The grouper name {key} is not found".format(key=key) - ) + raise KeyError(f"The grouper name {key} is not found") ax = Index(obj[key], name=key) else: @@ -190,9 +189,7 @@ def _set_grouper(self, obj, sort=False): else: if level not in (0, ax.name): - raise ValueError( - "The level {level} is not valid".format(level=level) - ) + raise ValueError(f"The level {level} is not valid") # possibly sort if (self.sort or sort) and not ax.is_monotonic: @@ -211,13 +208,13 @@ def groups(self): def __repr__(self) -> str: attrs_list = ( - "{}={!r}".format(attr_name, getattr(self, attr_name)) + f"{attr_name}={getattr(self, attr_name)!r}" for attr_name in self._attributes if getattr(self, attr_name) is not None ) attrs = ", ".join(attrs_list) cls_name = self.__class__.__name__ - return "{}({})".format(cls_name, attrs) + return f"{cls_name}({attrs})" class Grouping: @@ -279,17 +276,17 @@ def __init__( if level is not None: if not isinstance(level, int): if level not in index.names: - raise AssertionError( - "Level {level} not in index".format(level=level) - ) + raise AssertionError(f"Level {level} not in index") level = index.names.index(level) if self.name is None: self.name = index.names[level] - self.grouper, self._codes, self._group_index = index._get_grouper_for_level( # noqa: E501 - self.grouper, level - ) + ( + self.grouper, + self._codes, + self._group_index, + ) = index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes @@ -347,17 +344,16 @@ def __init__( ): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError("Grouper for '{t}' not 1-dimensional".format(t=t)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) if not ( hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index) ): + grper = pprint_thing(self.grouper) errmsg = ( "Grouper result violates len(labels) == " - "len(data)\nresult: {grper}".format( - grper=pprint_thing(self.grouper) - ) + f"len(data)\nresult: {grper}" ) self.grouper = None # Try for sanity raise AssertionError(errmsg) @@ -372,13 +368,13 @@ def __init__( self.grouper = self.grouper.astype("timedelta64[ns]") def __repr__(self) -> str: - return "Grouping({0})".format(self.name) + return f"Grouping({self.name})" def 
__iter__(self): return iter(self.indices) - _codes = None # type: np.ndarray - _group_index = None # type: Index + _codes: Optional[np.ndarray] = None + _group_index: Optional[Index] = None @property def ngroups(self) -> int: @@ -387,7 +383,7 @@ def ngroups(self) -> int: @cache_readonly def indices(self): # we have a list of groupers - if isinstance(self.grouper, BaseGrouper): + if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices values = ensure_categorical(self.grouper) @@ -409,12 +405,13 @@ def result_index(self) -> Index: def group_index(self) -> Index: if self._group_index is None: self._make_codes() + assert self._group_index is not None return self._group_index def _make_codes(self) -> None: if self._codes is None or self._group_index is None: # we have a list of groupers - if isinstance(self.grouper, BaseGrouper): + if isinstance(self.grouper, ops.BaseGrouper): codes = self.grouper.codes_info uniques = self.grouper.result_index else: @@ -433,11 +430,11 @@ def get_grouper( key=None, axis: int = 0, level=None, - sort=True, - observed=False, - mutated=False, - validate=True, -) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: + sort: bool = True, + observed: bool = False, + mutated: bool = False, + validate: bool = True, +) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -497,11 +494,7 @@ def get_grouper( if isinstance(level, str): if obj.index.name != level: - raise ValueError( - "level name {level} is not the name of the index".format( - level=level - ) - ) + raise ValueError(f"level name {level} is not the name of the index") elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") @@ -519,7 +512,7 @@ def get_grouper( return grouper, [key.key], obj # already have a BaseGrouper, just return it - elif isinstance(key, BaseGrouper): + elif isinstance(key, ops.BaseGrouper): return key, [], obj # In the future, a tuple key will always mean an actual key, @@ -584,8 +577,8 @@ def get_grouper( else: levels = [level] * len(keys) - groupings = [] # type: List[Grouping] - exclusions = [] # type: List[Hashable] + groupings: List[Grouping] = [] + exclusions: List[Hashable] = [] # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: @@ -633,12 +626,8 @@ def is_in_obj(gpr) -> bool: if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - ( - "Length of grouper ({len_gpr}) and axis ({len_axis})" - " must be same length".format( - len_gpr=len(gpr), len_axis=obj.shape[axis] - ) - ) + f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]})" + " must be same length" ) # create the Grouping @@ -666,11 +655,11 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj -def _is_label_like(val): +def _is_label_like(val) -> bool: return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fec472f503c9f..4780254e060e6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,6 +36,7 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna +from pandas._typing import FrameOrSeries import 
pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -89,12 +90,16 @@ def __init__( self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = groupings # type: Sequence[grouper.Grouping] + self._groupings: List[grouper.Grouping] = list(groupings) self.sort = sort self.group_keys = group_keys self.mutated = mutated self.indexer = indexer + @property + def groupings(self) -> List["grouper.Grouping"]: + return self._groupings + @property def shape(self): return tuple(ping.ngroups for ping in self.groupings) @@ -106,7 +111,7 @@ def __iter__(self): def nkeys(self) -> int: return len(self.groupings) - def get_iterator(self, data, axis=0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -120,7 +125,7 @@ def get_iterator(self, data, axis=0): for key, (i, group) in zip(keys, splitter): yield key, group - def _get_splitter(self, data, axis=0): + def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -142,13 +147,13 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - def apply(self, f, data, axis: int = 0): + def apply(self, f, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() result_values = None - sdata = splitter._get_sorted_data() + sdata: FrameOrSeries = splitter._get_sorted_data() if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray @@ -157,29 +162,28 @@ def apply(self, f, data, axis: int = 0): elif ( com.get_callable_name(f) not in base.plotting_methods - and hasattr(splitter, "fast_apply") + and isinstance(splitter, FrameSplitter) and axis == 0 - # with MultiIndex, apply_frame_axis0 would raise InvalidApply - # TODO: can we make this check prettier? - and not sdata.index._has_complex_internals + # apply_frame_axis0 doesn't allow MultiIndex + and not isinstance(sdata.index, MultiIndex) ): try: result_values, mutated = splitter.fast_apply(f, group_keys) - # If the fast apply path could be used we can return here. - # Otherwise we need to fall back to the slow implementation. - if len(result_values) == len(group_keys): - return group_keys, result_values, mutated - except libreduction.InvalidApply as err: - # Cannot fast apply on MultiIndex (_has_complex_internals). - # This Exception is also raised if `f` triggers an exception + # This Exception is raised if `f` triggers an exception # but it is preferable to raise the exception in Python. if "Let this error raise above us" not in str(err): # TODO: can we infer anything about whether this is # worth-retrying in pure-python? raise + else: + # If the fast apply path could be used we can return here. + # Otherwise we need to fall back to the slow implementation. 
+ if len(result_values) == len(group_keys): + return group_keys, result_values, mutated + for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, "name", key) @@ -197,7 +201,7 @@ def apply(self, f, data, axis: int = 0): continue # group might be modified - group_axes = _get_axes(group) + group_axes = group.axes res = f(group) if not _is_indexed_like(res, group_axes): mutated = True @@ -229,8 +233,7 @@ def names(self): def size(self) -> Series: """ - Compute group sizes - + Compute group sizes. """ ids, _, ngroup = self.group_info ids = ensure_platform_int(ids) @@ -292,7 +295,7 @@ def reconstructed_codes(self) -> List[np.ndarray]: return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) @cache_readonly - def result_index(self): + def result_index(self) -> Index: if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) @@ -355,40 +358,33 @@ def _is_builtin_func(self, arg): def _get_cython_function(self, kind: str, how: str, values, is_numeric: bool): dtype_str = values.dtype.name + ftype = self._cython_functions[kind][how] - def get_func(fname): - # see if there is a fused-type version of function - # only valid for numeric - f = getattr(libgroupby, fname, None) - if f is not None and is_numeric: - return f - - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, "object"]: - f2 = getattr( - libgroupby, - "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt), - None, - ) - if f2 is not None: - return f2 - - if hasattr(f, "__signatures__"): - # inspect what fused types are implemented - if dtype_str == "object" and "object" not in f.__signatures__: - # return None so we get a NotImplementedError below - # instead of a TypeError at runtime - return None + # see if there is a fused-type version of function + # only valid for numeric + f = getattr(libgroupby, ftype, None) + if f is not None and is_numeric: return f - ftype = self._cython_functions[kind][how] + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, "object"]: + f2 = getattr(libgroupby, f"{ftype}_{dt}", None) + if f2 is not None: + return f2 + + if hasattr(f, "__signatures__"): + # inspect what fused types are implemented + if dtype_str == "object" and "object" not in f.__signatures__: + # disallow this function so we get a NotImplementedError below + # instead of a TypeError at runtime + f = None - func = get_func(ftype) + func = f if func is None: raise NotImplementedError( - "function is not implemented for this dtype: " - "[how->{how},dtype->{dtype_str}]".format(how=how, dtype_str=dtype_str) + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" ) return func @@ -428,8 +424,15 @@ def _get_cython_func_and_vals( return func, values def _cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs - ): + self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs + ) -> Tuple[np.ndarray, Optional[List[str]]]: + """ + Returns the values of a cython operation as a Tuple of [data, names]. + + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). 
+ """ + assert kind in ["transform", "aggregate"] orig_values = values @@ -449,18 +452,16 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): - raise NotImplementedError( - "{dtype} dtype not supported".format(dtype=values.dtype) - ) + raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( - "datetime64 type does not support {how} operations".format(how=how) + f"datetime64 type does not support {how} operations" ) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( - "timedelta64 type does not support {how} operations".format(how=how) + f"timedelta64 type does not support {how} operations" ) if is_datetime64tz_dtype(values.dtype): @@ -513,9 +514,7 @@ def _cython_operation( out_dtype = "float" else: if is_numeric: - out_dtype = "{kind}{itemsize}".format( - kind=values.dtype.kind, itemsize=values.dtype.itemsize - ) + out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" @@ -552,7 +551,7 @@ def _cython_operation( if vdim == 1 and arity == 1: result = result[:, 0] - names = self._name_functions.get(how, None) # type: Optional[List[str]] + names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) @@ -601,33 +600,40 @@ def _transform( return result def agg_series(self, obj: Series, func): - if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": + # Caller is responsible for checking ngroups != 0 + assert self.ngroups != 0 + + if len(obj) == 0: + # SeriesGrouper would raise if we were to call _aggregate_series_fast + return self._aggregate_series_pure_python(obj, func) + + elif is_extension_array_dtype(obj.dtype): # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider + # In the datetime64tz case it would incorrectly cast to tz-naive # TODO: can we get a performant workaround for EAs backed by ndarray? - # TODO: is the datetime64tz case supposed to go through here? return self._aggregate_series_pure_python(obj, func) - elif obj.index._has_complex_internals: + elif isinstance(obj.index, MultiIndex): # MultiIndex; Pre-empt TypeError in _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "No result." 
in str(err): - # raised in libreduction - pass - elif "Function does not reduce" in str(err): + if "Function does not reduce" in str(err): # raised in libreduction pass else: raise return self._aggregate_series_pure_python(obj, func) - def _aggregate_series_fast(self, obj, func): - # At this point we have already checked that obj.index is not a MultiIndex - # and that obj is backed by an ndarray, not ExtensionArray + def _aggregate_series_fast(self, obj: Series, func): + # At this point we have already checked that + # - obj.index is not a MultiIndex + # - obj is backed by an ndarray, not ExtensionArray + # - len(obj) > 0 + # - ngroups != 0 func = self._is_builtin_func(func) group_index, _, ngroups = self.group_info @@ -641,7 +647,7 @@ def _aggregate_series_fast(self, obj, func): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python(self, obj, func): + def _aggregate_series_pure_python(self, obj: Series, func): group_index, _, ngroups = self.group_info @@ -654,17 +660,23 @@ def _aggregate_series_pure_python(self, obj, func): res = func(group) if result is None: if isinstance(res, (Series, Index, np.ndarray)): - raise ValueError("Function does not reduce") + if len(res) == 1: + # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) + # FIXME: are we potentially losing important res.index info? + + # TODO: use `.item()` if/when we un-deprecate it. + # For non-Series we could just do `res[0]` + res = next(iter(res)) + else: + raise ValueError("Function does not reduce") result = np.empty(ngroups, dtype="O") counts[label] = group.shape[0] result[label] = res - if result is not None: - # if splitter is empty, result can be None, in which case - # maybe_convert_objects would raise TypeError - result = lib.maybe_convert_objects(result, try_float=0) - # TODO: try_cast back to EA? + assert result is not None + result = lib.maybe_convert_objects(result, try_float=0) + # TODO: try_cast back to EA? return result, counts @@ -700,7 +712,12 @@ class BinGrouper(BaseGrouper): """ def __init__( - self, bins, binlabels, filter_empty=False, mutated=False, indexer=None + self, + bins, + binlabels, + filter_empty: bool = False, + mutated: bool = False, + indexer=None, ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) @@ -708,6 +725,10 @@ def __init__( self.mutated = mutated self.indexer = indexer + # These lengths must match, otherwise we could call agg_series + # with empty self.bins, which would raise in libreduction.
+ assert len(self.binlabels) == len(self.bins) + @cache_readonly def groups(self): """ dict {group name -> group labels} """ @@ -734,7 +755,7 @@ def _get_grouper(self): """ return self - def get_iterator(self, data: NDFrame, axis: int = 0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -806,18 +827,19 @@ def names(self): return [self.binlabels.name] @property - def groupings(self): - from pandas.core.groupby.grouper import Grouping - + def groupings(self) -> "List[grouper.Grouping]": return [ - Grouping(lvl, lvl, in_axis=False, level=None, name=name) + grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name) for lvl, name in zip(self.levels, self.names) ] def agg_series(self, obj: Series, func): + # Caller is responsible for checking ngroups != 0 + assert self.ngroups != 0 + assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result + if is_extension_array_dtype(obj.dtype): - # pre-empty SeriesBinGrouper from raising TypeError - # TODO: watch out, this can return None + # pre-empt SeriesBinGrouper from raising TypeError return self._aggregate_series_pure_python(obj, func) dummy = obj[:0] @@ -825,13 +847,6 @@ def agg_series(self, obj: Series, func): return grouper.get_result() -def _get_axes(group): - if isinstance(group, Series): - return [group.index] - else: - return group.axes - - def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): if len(axes) > 1: @@ -848,7 +863,7 @@ def _is_indexed_like(obj, axes) -> bool: class DataSplitter: - def __init__(self, data, labels, ngroups, axis: int = 0): + def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) self.ngroups = ngroups @@ -879,15 +894,15 @@ def __iter__(self): for i, (start, end) in enumerate(zip(starts, ends)): yield i, self._chop(sdata, slice(start, end)) - def _get_sorted_data(self): + def _get_sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: return sdata._get_values(slice_obj) @@ -899,16 +914,16 @@ def fast_apply(self, f, names): sdata = self._get_sorted_data() return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: if self.axis == 0: return sdata.iloc[slice_obj] else: return sdata._slice(slice_obj, axis=1) -def get_splitter(data: NDFrame, *args, **kwargs): +def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter: if isinstance(data, Series): - klass = SeriesSplitter # type: Type[DataSplitter] + klass: Type[DataSplitter] = SeriesSplitter else: # i.e. 
DataFrame klass = FrameSplitter diff --git a/pandas/core/index.py b/pandas/core/index.py index d308ac1a9b1c7..84b37b8bd659d 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -13,13 +13,9 @@ RangeIndex, TimedeltaIndex, UInt64Index, - _all_indexes_same, - _get_combined_index, - _get_consensus_names, - _get_objs_combined_axis, _new_Index, - _union_indexes, ensure_index, ensure_index_from_sequences, + get_objs_combined_axis, ) from pandas.core.indexes.multi import _sparsify # noqa:F401 diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 86d55ce2e7cc3..f650a62bc5b74 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,4 +1,5 @@ import textwrap +from typing import List, Set import warnings from pandas._libs import NaT, lib @@ -6,23 +7,23 @@ import pandas.core.common as com from pandas.core.indexes.base import ( Index, + InvalidIndexError, _new_Index, ensure_index, ensure_index_from_sequences, ) -from pandas.core.indexes.base import InvalidIndexError # noqa:F401 -from pandas.core.indexes.category import CategoricalIndex # noqa:F401 +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.interval import IntervalIndex # noqa:F401 -from pandas.core.indexes.multi import MultiIndex # noqa:F401 -from pandas.core.indexes.numeric import ( # noqa:F401 +from pandas.core.indexes.interval import IntervalIndex +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.numeric import ( Float64Index, Int64Index, NumericIndex, UInt64Index, ) from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.range import RangeIndex # noqa:F401 +from pandas.core.indexes.range import RangeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex _sort_msg = textwrap.dedent( @@ -57,15 +58,16 @@ "NaT", "ensure_index", "ensure_index_from_sequences", - "_get_combined_index", - "_get_objs_combined_axis", - "_union_indexes", - "_get_consensus_names", - "_all_indexes_same", + "get_objs_combined_axis", + "union_indexes", + "get_consensus_names", + "all_indexes_same", ] -def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): +def get_objs_combined_axis( + objs, intersect: bool = False, axis=0, sort: bool = True +) -> Index: """ Extract combined index: return intersection or union (depending on the value of "intersect") of indexes on given axis, or None if all objects @@ -73,9 +75,8 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): Parameters ---------- - objs : list of objects - Each object will only be considered if it has a _get_axis - attribute. + objs : list + Series or DataFrame objects, may be mix of the two. intersect : bool, default False If True, calculate the intersection between indexes. Otherwise, calculate the union. @@ -88,26 +89,27 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): ------- Index """ - obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, "_get_axis")] - if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) + obs_idxes = [obj._get_axis(axis) for obj in objs] + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_distinct_objs(objs): +def _get_distinct_objs(objs: List[Index]) -> List[Index]: """ Return a list with distinct elements of "objs" (different ids). Preserves order. 
""" - ids = set() + ids: Set[int] = set() res = [] for obj in objs: - if not id(obj) in ids: + if id(obj) not in ids: ids.add(id(obj)) res.append(obj) return res -def _get_combined_index(indexes, intersect=False, sort=False): +def _get_combined_index( + indexes: List[Index], intersect: bool = False, sort: bool = False +) -> Index: """ Return the union or intersection of indexes. @@ -137,7 +139,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): for other in indexes[1:]: index = index.intersection(other) else: - index = _union_indexes(indexes, sort=sort) + index = union_indexes(indexes, sort=sort) index = ensure_index(index) if sort: @@ -148,7 +150,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): return index -def _union_indexes(indexes, sort=True): +def union_indexes(indexes, sort=True) -> Index: """ Return the union of indexes. @@ -174,7 +176,7 @@ def _union_indexes(indexes, sort=True): indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds): + def _unique_indices(inds) -> Index: """ Convert indexes to lists and concatenate them, removing duplicates. @@ -217,7 +219,7 @@ def conv(i): return _unique_indices(indexes) - name = _get_consensus_names(indexes)[0] + name = get_consensus_names(indexes)[0] if name != index.name: index = index._shallow_copy(name=name) return index @@ -264,7 +266,7 @@ def _sanitize_and_check(indexes): return indexes, "array" -def _get_consensus_names(indexes): +def get_consensus_names(indexes): """ Give a consensus 'names' to indexes. @@ -289,7 +291,7 @@ def _get_consensus_names(indexes): return [None] * indexes[0].nlevels -def _all_indexes_same(indexes): +def all_indexes_same(indexes): """ Determine if all indexes contain the same elements. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c9697c530628a..dd38bd0ee5f70 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -54,6 +54,7 @@ ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, + ABCIntervalIndex, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, @@ -204,11 +205,11 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = ( + _deprecations: FrozenSet[str] = ( PandasObject._deprecations | IndexOpsMixin._deprecations | frozenset(["asobject", "contains", "dtype_str", "get_values", "set_value"]) - ) # type: FrozenSet[str] + ) # To hand over control to subclasses _join_precedence = 1 @@ -230,7 +231,7 @@ def _outer_indexer(self, left, right): return libjoin.outer_join_indexer(left, right) _typ = "index" - _data = None + _data: Union[ExtensionArray, np.ndarray] _id = None name = None _comparables = ["name"] @@ -265,7 +266,7 @@ def __new__( name=None, fastpath=None, tupleize_cols=True, - **kwargs + **kwargs, ) -> "Index": from .range import RangeIndex @@ -320,10 +321,9 @@ def __new__( # the DatetimeIndex construction. 
# Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex( - data, copy=False, name=name, **kwargs - ) # type: "Index" - return result.astype(object) + return DatetimeIndex(data, copy=False, name=name, **kwargs).astype( + object + ) else: return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) @@ -331,8 +331,9 @@ def __new__( if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy - result = TimedeltaIndex(data, copy=False, name=name, **kwargs) - return result.astype(object) + return TimedeltaIndex(data, copy=False, name=name, **kwargs).astype( + object + ) else: return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) @@ -380,7 +381,7 @@ def __new__( pass # Return an actual float index. - return Float64Index(data, copy=copy, dtype=dtype, name=name) + return Float64Index(data, copy=copy, name=name) elif inferred == "string": pass @@ -450,7 +451,9 @@ def __new__( return PeriodIndex(subarr, name=name, **kwargs) except IncompatibleFrequency: pass - return cls._simple_new(subarr, name) + if kwargs: + raise TypeError(f"Unexpected keyword arguments {set(kwargs)!r}") + return cls._simple_new(subarr, name, **kwargs) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -506,7 +509,7 @@ def asi8(self): return None @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): """ We require that we have a dtype compat for the values. If we are passed a non-dtype compat, then coerce using the constructor. @@ -528,8 +531,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # we actually set this value too. result._index_data = values result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) + return result._reset_identity() @cache_readonly @@ -607,7 +609,7 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def is_(self, other): + def is_(self, other) -> bool: """ More flexible, faster check like ``is`` but that works through views. @@ -649,7 +651,7 @@ def _engine(self): # Array-Like Methods # ndarray compat - def __len__(self): + def __len__(self) -> int: """ Return the length of the Index. """ @@ -961,14 +963,14 @@ def __repr__(self): data = self._format_data() attrs = self._format_attrs() space = self._format_space() - - prepr = (",%s" % space).join("%s=%s" % (k, v) for k, v in attrs) + attrs_str = [f"{k}={v}" for k, v in attrs] + prepr = f",{space}".join(attrs_str) # no data provided, just attributes if data is None: data = "" - res = "%s(%s%s)" % (klass, data, prepr) + res = f"{klass}({data}{prepr})" return res @@ -1122,26 +1124,13 @@ def _summary(self, name=None): tail = self[-1] if hasattr(tail, "format") and not isinstance(tail, str): tail = tail.format() - index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) + index_summary = f", {head} to {tail}" else: index_summary = "" if name is None: name = type(self).__name__ - return "%s: %s entries%s" % (name, len(self), index_summary) - - def summary(self, name=None): - """ - Return a summarized representation. - - .. 
deprecated:: 0.23.0 - """ - warnings.warn( - "'summary' is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - return self._summary(name) + return f"{name}: {len(self)} entries{index_summary}" # -------------------------------------------------------------------- # Conversion Methods @@ -1302,7 +1291,7 @@ def _set_names(self, values, level=None): if not is_list_like(values): raise ValueError("Names must be a list-like") if len(values) != 1: - raise ValueError("Length of new names must be 1, got %d" % len(values)) + raise ValueError(f"Length of new names must be 1, got {len(values)}") # GH 20527 # All items in 'name' need to be hashable: @@ -1450,7 +1439,7 @@ def rename(self, name, inplace=False): # Level-Centric Methods @property - def nlevels(self): + def nlevels(self) -> int: """ Number of levels. """ @@ -1473,8 +1462,8 @@ def _validate_index_level(self, level): if isinstance(level, int): if level < 0 and level != -1: raise IndexError( - "Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level,) + f"Too many levels: Index has only 1 level," + f" {level} is not a valid level number" ) elif level > 0: raise IndexError( @@ -1651,7 +1640,7 @@ def _get_grouper_for_level(self, mapper, level=None): # Introspection Methods @property - def is_monotonic(self): + def is_monotonic(self) -> bool: """ Alias for is_monotonic_increasing. """ @@ -1675,7 +1664,7 @@ def is_monotonic_increasing(self): return self._engine.is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing (only equal or decreasing) values. @@ -1692,7 +1681,7 @@ def is_monotonic_decreasing(self): return self._engine.is_monotonic_decreasing @property - def _is_strictly_monotonic_increasing(self): + def _is_strictly_monotonic_increasing(self) -> bool: """ Return if the index is strictly monotonic increasing (only increasing) values. @@ -1709,7 +1698,7 @@ def _is_strictly_monotonic_increasing(self): return self.is_unique and self.is_monotonic_increasing @property - def _is_strictly_monotonic_decreasing(self): + def _is_strictly_monotonic_decreasing(self) -> bool: """ Return if the index is strictly monotonic decreasing (only decreasing) values. @@ -1726,32 +1715,32 @@ def _is_strictly_monotonic_decreasing(self): return self.is_unique and self.is_monotonic_decreasing @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: """ Return if the index has unique values. """ return self._engine.is_unique @property - def has_duplicates(self): + def has_duplicates(self) -> bool: return not self.is_unique - def is_boolean(self): + def is_boolean(self) -> bool: return self.inferred_type in ["boolean"] - def is_integer(self): + def is_integer(self) -> bool: return self.inferred_type in ["integer"] - def is_floating(self): + def is_floating(self) -> bool: return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] - def is_numeric(self): + def is_numeric(self) -> bool: return self.inferred_type in ["integer", "floating"] - def is_object(self): + def is_object(self) -> bool: return is_object_dtype(self.dtype) - def is_categorical(self): + def is_categorical(self) -> bool: """ Check if the Index holds categorical data. 
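Roughly what the boolean introspection predicates annotated in the hunk above report (an illustrative sketch only, not part of this patch; any pandas version that still ships these methods behaves the same way):

import pandas as pd

idx = pd.CategoricalIndex(["a", "b", "b"])

# These predicates already return plain booleans; the annotations above
# only make the return type explicit.
assert idx.is_categorical()
assert not idx.is_numeric()
assert not idx.is_unique  # "b" appears twice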
@@ -1787,10 +1776,10 @@ def is_categorical(self): """ return self.inferred_type in ["categorical"] - def is_interval(self): + def is_interval(self) -> bool: return self.inferred_type in ["interval"] - def is_mixed(self): + def is_mixed(self) -> bool: return self.inferred_type in ["mixed"] def holds_integer(self): @@ -1807,7 +1796,7 @@ def inferred_type(self): return lib.infer_dtype(self, skipna=False) @cache_readonly - def is_all_dates(self): + def is_all_dates(self) -> bool: return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- @@ -1869,8 +1858,7 @@ def _isnan(self): @cache_readonly def _nan_idxs(self): if self._can_hold_na: - w, = self._isnan.nonzero() - return w + return self._isnan.nonzero()[0] else: return np.array([], dtype=np.int64) @@ -2673,7 +2661,7 @@ def difference(self, other, sort=None): except TypeError: pass - return this._shallow_copy(the_diff, name=result_name, freq=None) + return this._shallow_copy(the_diff, name=result_name) def symmetric_difference(self, other, result_name=None, sort=None): """ @@ -3392,7 +3380,7 @@ def _reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 - new_index = self._shallow_copy_with_infer(new_labels, freq=None) + new_index = self._shallow_copy_with_infer(new_labels) return new_index, indexer, new_indexer # -------------------------------------------------------------------- @@ -4026,30 +4014,6 @@ def _string_data_error(cls, data): "to explicitly cast to a numeric type" ) - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. - """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - raise cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -4087,18 +4051,13 @@ def _assert_can_do_op(self, value): msg = "'value' must be a scalar, passed: {0}" raise TypeError(msg.format(type(value).__name__)) - @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False - - def _is_memory_usage_qualified(self): + def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ return self.is_object() - def is_type_compatible(self, kind): + def is_type_compatible(self, kind) -> bool: """ Whether the index type is compatible with the provided type. """ @@ -4137,14 +4096,14 @@ def is_type_compatible(self, kind): """ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: hash(key) try: return key in self._engine except (OverflowError, TypeError, ValueError): return False - def contains(self, key): + def contains(self, key) -> bool: """ Return a boolean indicating whether the provided key is in the index. @@ -4205,7 +4164,7 @@ def __getitem__(self, key): else: return result - def _can_hold_identifiers_and_holds_name(self, name): + def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ Faster check for ``name in self`` when we know `name` is a Python identifier (e.g. 
in NDFrame.__getattr__, which hits this to support @@ -4260,7 +4219,13 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class. """ # must be overridden in specific classes - klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray) + klasses = ( + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex, + ExtensionArray, + ABCIntervalIndex, + ) to_concat = [ x.astype(object) if isinstance(x, klasses) else x for x in to_concat ] @@ -4296,7 +4261,7 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other): + def equals(self, other) -> bool: """ Determine if two Index objects contain the same elements. @@ -4320,7 +4285,7 @@ def equals(self, other): com.values_from_object(self), com.values_from_object(other) ) - def identical(self, other): + def identical(self, other) -> bool: """ Similar to equals, but check that other comparable attributes are also equal. @@ -4562,7 +4527,7 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError("Not supported for type %s" % type(self).__name__) + raise NotImplementedError(f"Not supported for type {type(self).__name__}") def argsort(self, *args, **kwargs): """ @@ -5069,8 +5034,8 @@ def get_slice_bound(self, label, side, kind): if side not in ("left", "right"): raise ValueError( - "Invalid value for side kwarg," - " must be either 'left' or 'right': %s" % (side,) + f"Invalid value for side kwarg, must be either" + f" 'left' or 'right': {side}" ) original_label = label @@ -5624,7 +5589,7 @@ def _trim_front(strings): def _validate_join_method(method): if method not in ["left", "right", "inner", "outer"]: - raise ValueError("do not recognize join method %s" % method) + raise ValueError(f"do not recognize join method {method}") def default_index(n): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e5a8edb56e413..e0ffc726bc3a1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -276,7 +276,7 @@ def _shallow_copy(self, values=None, dtype=None, **kwargs): dtype = self.dtype return super()._shallow_copy(values=values, dtype=dtype, **kwargs) - def _is_dtype_compat(self, other): + def _is_dtype_compat(self, other) -> bool: """ *this is an internal non-public method* @@ -357,7 +357,7 @@ def _format_attrs(self): ] if self.name is not None: attrs.append(("name", ibase.default_pprint(self.name))) - attrs.append(("dtype", "'%s'" % self.dtype.name)) + attrs.append(("dtype", f"'{self.dtype.name}'")) max_seq_items = get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: attrs.append(("length", len(self))) @@ -366,7 +366,7 @@ def _format_attrs(self): # -------------------------------------------------------------------- @property - def inferred_type(self): + def inferred_type(self) -> str: return "categorical" @property @@ -407,7 +407,7 @@ def _reverse_indexer(self): return self._data._reverse_indexer() @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. 
if is_scalar(key) and isna(key): return self.hasnans @@ -455,7 +455,7 @@ def _engine(self): # introspection @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return self._engine.is_unique @property @@ -463,7 +463,7 @@ def is_monotonic_increasing(self): return self._engine.is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: return self._engine.is_monotonic_decreasing @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index bf89bbbdf2b79..e420cf0cb0d78 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -11,7 +11,7 @@ from pandas._libs.algos import unique_deltas from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_int64, @@ -27,7 +27,7 @@ from pandas.core import algorithms, ops from pandas.core.accessor import PandasDelegate -from pandas.core.arrays import ExtensionOpsMixin +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.arrays.datetimelike import ( DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8, @@ -36,7 +36,6 @@ from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.tools.timedeltas import to_timedelta -import pandas.io.formats.printing as printing from pandas.tseries.frequencies import to_offset _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -78,7 +77,7 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): common ops mixin to support a unified interface datetimelike Index """ - _data = None + _data: ExtensionArray # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. They can be made into cache_readonly for Index @@ -148,7 +147,7 @@ def wrapper(self, other): return wrapper @property - def _ndarray_values(self): + def _ndarray_values(self) -> np.ndarray: return self._data._ndarray_values # ------------------------------------------------------------------------ @@ -496,7 +495,7 @@ def _format_attrs(self): if attrib == "freq": freq = self.freqstr if freq is not None: - freq = "'%s'" % freq + freq = f"{freq!r}" attrs.append(("freq", freq)) return attrs @@ -686,17 +685,13 @@ def _summary(self, name=None): """ formatter = self._formatter_func if len(self) > 0: - index_summary = ", %s to %s" % (formatter(self[0]), formatter(self[-1])) + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" else: index_summary = "" if name is None: name = type(self).__name__ - result = "%s: %s entries%s" % ( - printing.pprint_thing(name), - len(self), - index_summary, - ) + result = f"{name}: {len(self)} entries{index_summary}" if self.freq: result += "\nFreq: %s" % self.freqstr @@ -737,8 +732,7 @@ def astype(self, dtype, copy=True): # _data.astype call above return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - @deprecate_kwarg(old_arg_name="n", new_arg_name="periods") - def shift(self, periods, freq=None): + def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -747,7 +741,7 @@ def shift(self, periods, freq=None): Parameters ---------- - periods : int + periods : int, default 1 Number of periods (or increments) to shift by, can be positive or negative. 
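Roughly how the new `shift` signature above is used (an illustrative sketch, not part of this patch; the dates and frequencies are arbitrary):

import pandas as pd

idx = pd.date_range("2011-01-31", periods=3, freq="M")

# `periods` now defaults to 1, and the previously deprecated `n=` alias is gone.
one_step = idx.shift()             # same as idx.shift(periods=1)
two_days = idx.shift(2, freq="D")  # shift each element by two calendar days

print(one_step)
print(two_days)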
@@ -832,11 +826,11 @@ class DatetimelikeDelegateMixin(PandasDelegate): """ # raw_methods : dispatch methods that shouldn't be boxed in an Index - _raw_methods = set() # type: Set[str] + _raw_methods: Set[str] = set() # raw_properties : dispatch properties that shouldn't be boxed in an Index - _raw_properties = set() # type: Set[str] + _raw_properties: Set[str] = set() name = None - _data = None + _data: ExtensionArray @property def _delegate_class(self): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 2d0ecf1b936da..b6891bc7e2b59 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -410,7 +410,7 @@ def tz(self, value): tzinfo = tz @cache_readonly - def _is_dates_only(self): + def _is_dates_only(self) -> bool: """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only @@ -464,6 +464,12 @@ def _convert_for_op(self, value): return _to_M8(value) raise ValueError("Passed item and index have different timezone") + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx.freq = None + return new_idx + # -------------------------------------------------------------------- # Rendering Methods @@ -485,7 +491,7 @@ def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) + return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- # Set Operation Methods @@ -657,14 +663,14 @@ def _get_time_micros(self): values = self._data._local_timestamps() return fields.get_time_micros(values) - def to_series(self, keep_tz=None, index=None, name=None): + def to_series(self, keep_tz=lib._no_default, index=None, name=None): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index. Parameters ---------- - keep_tz : optional, defaults False + keep_tz : optional, defaults True Return the data keeping the timezone. If keep_tz is True: @@ -680,10 +686,10 @@ def to_series(self, keep_tz=None, index=None, name=None): Series will have a datetime64[ns] dtype. TZ aware objects will have the tz removed. - .. versionchanged:: 0.24 - The default value will change to True in a future release. - You can set ``keep_tz=True`` to already obtain the future - behaviour and silence the warning. + .. versionchanged:: 1.0.0 + The default value is now True. In a future version, + this keyword will be removed entirely. Stop passing the + argument to obtain the future behavior and silence the warning. index : Index, optional Index of resulting Series. If None, defaults to original index. @@ -702,27 +708,27 @@ def to_series(self, keep_tz=None, index=None, name=None): if name is None: name = self.name - if keep_tz is None and self.tz is not None: - warnings.warn( - "The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", - FutureWarning, - stacklevel=2, - ) - keep_tz = False - elif keep_tz is False: - warnings.warn( - "Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. 
If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", - FutureWarning, - stacklevel=2, - ) + if keep_tz is not lib._no_default: + if keep_tz: + warnings.warn( + "The 'keep_tz' keyword in DatetimeIndex.to_series " + "is deprecated and will be removed in a future version. " + "You can stop passing 'keep_tz' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + else: + warnings.warn( + "Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", + FutureWarning, + stacklevel=2, + ) + else: + keep_tz = True if keep_tz and self.tz is not None: # preserve the tz & copy @@ -1231,17 +1237,17 @@ def searchsorted(self, value, side="left", sorter=None): return self.values.searchsorted(value, side=side) - def is_type_compatible(self, typ): + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "datetime" @property - def inferred_type(self): + def inferred_type(self) -> str: # b/c datetime is represented as microseconds since the epoch, make # sure we can't have ambiguous indexing return "datetime64" @property - def is_all_dates(self): + def is_all_dates(self) -> bool: return True def insert(self, loc, item): @@ -1422,7 +1428,7 @@ def date_range( normalize=False, name=None, closed=None, - **kwargs + **kwargs, ): """ Return a fixed frequency DatetimeIndex. @@ -1572,7 +1578,7 @@ def date_range( tz=tz, normalize=normalize, closed=closed, - **kwargs + **kwargs, ) return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) @@ -1588,7 +1594,7 @@ def bdate_range( weekmask=None, holidays=None, closed=None, - **kwargs + **kwargs, ): """ Return a fixed frequency DatetimeIndex, with business day as the default @@ -1681,7 +1687,7 @@ def bdate_range( normalize=normalize, name=name, closed=closed, - **kwargs + **kwargs, ) diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 08c86b81b59c0..2c9521d23f71a 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -11,8 +11,6 @@ import numpy as np -from pandas.util._decorators import deprecate_kwarg - from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.base import PandasObject @@ -109,7 +107,7 @@ def __str__(self) -> str: return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) def __repr__(self) -> str: - return "%s(%s)" % (self.__class__.__name__, str(self)) + return f"{self.__class__.__name__}({str(self)})" __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled @@ -153,9 +151,8 @@ def __repr__(self) -> str: Return a string representation for this object. """ prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True) - return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + return f"{type(self).__name__}({prepr}, dtype='{self.dtype}')" - @deprecate_kwarg(old_arg_name="v", new_arg_name="value") def searchsorted(self, value, side="left", sorter=None): """ Find indices to insert `value` so as to maintain order. 
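Roughly what the `DatetimeIndex.to_series` change in the pandas/core/indexes/datetimes.py hunks above means for callers (an illustrative sketch, not part of this patch; the dates and timezone are arbitrary): the timezone is now kept by default, and passing `keep_tz` at all triggers a FutureWarning.

import pandas as pd

dti = pd.date_range("2020-01-01", periods=3, freq="D", tz="UTC")

# No keyword passed: the timezone is preserved and no warning is raised.
ser = dti.to_series()
assert str(ser.dtype) == "datetime64[ns, UTC]"

# dti.to_series(keep_tz=False)  # would warn; use dti.tz_convert(None) first instead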
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index bc3c0be08ec12..35e8405e0f1aa 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -343,7 +343,7 @@ def _engine(self): right = self._maybe_convert_i8(self.right) return IntervalTree(left, right, closed=self.closed) - def __contains__(self, key): + def __contains__(self, key) -> bool: """ return a boolean if this key is IN the index We *only* accept an Interval @@ -468,7 +468,7 @@ def itemsize(self): warnings.simplefilter("ignore") return self.left.itemsize + self.right.itemsize - def __len__(self): + def __len__(self) -> int: return len(self.left) @cache_readonly @@ -483,7 +483,7 @@ def _values(self): return self._data @cache_readonly - def _ndarray_values(self): + def _ndarray_values(self) -> np.ndarray: return np.array(self._data) def __array__(self, result=None): @@ -524,12 +524,12 @@ def dtype(self): return self._data.dtype @property - def inferred_type(self): + def inferred_type(self) -> str: """Return a string of the type inferred from the values""" return "interval" @Appender(Index.memory_usage.__doc__) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: # we don't use an explicit engine # so return the bytes here return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) @@ -542,7 +542,7 @@ def mid(self): return self._data.mid @cache_readonly - def is_monotonic(self): + def is_monotonic(self) -> bool: """ Return True if the IntervalIndex is monotonic increasing (only equal or increasing values), else False @@ -550,7 +550,7 @@ def is_monotonic(self): return self.is_monotonic_increasing @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ Return True if the IntervalIndex is monotonic increasing (only equal or increasing values), else False @@ -558,7 +558,7 @@ def is_monotonic_increasing(self): return self._engine.is_monotonic_increasing @cache_readonly - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return True if the IntervalIndex is monotonic decreasing (only equal or decreasing values), else False @@ -935,7 +935,7 @@ def get_loc( None is specified as these are not yet implemented. 
""" ) - } + }, ) ) @Appender(_index_shared_docs["get_indexer"]) @@ -1213,7 +1213,7 @@ def _format_space(self): def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two IntervalIndex objects contain the same elements """ @@ -1357,7 +1357,7 @@ def func(self, other, sort=sort): return func @property - def is_all_dates(self): + def is_all_dates(self) -> bool: """ This is False even when left/right contain datetime-like objects, as the check is done on the Interval itself @@ -1374,7 +1374,7 @@ def is_all_dates(self): IntervalIndex._add_logical_methods_disabled() -def _is_valid_endpoint(endpoint): +def _is_valid_endpoint(endpoint) -> bool: """helper for interval_range to check if start/end are valid types""" return any( [ @@ -1386,7 +1386,7 @@ def _is_valid_endpoint(endpoint): ) -def _is_type_compatible(a, b): +def _is_type_compatible(a, b) -> bool: """helper for interval_range to check type compat of start/end/freq""" is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 19769d5b029a1..86398613798be 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -368,7 +368,7 @@ def _verify_integrity(self, codes=None, levels=None): if not level.is_unique: raise ValueError( "Level values must be unique: {values} on " - "level {level}".format(values=[value for value in level], level=i) + "level {level}".format(values=list(level), level=i) ) if self.sortorder is not None: if self.sortorder > self._lexsort_depth(): @@ -675,7 +675,7 @@ def array(self): raise ValueError(msg) @property - def _is_homogeneous_type(self): + def _is_homogeneous_type(self) -> bool: """Whether the levels of a MultiIndex all have the same dtype. This looks at the dtypes of the levels. @@ -956,7 +956,7 @@ def copy( codes=None, deep=False, _set_identity=False, - **kwargs + **kwargs, ): """ Make a copy of this object. 
Names, dtype, levels and codes can be @@ -1020,12 +1020,12 @@ def _shallow_copy_with_infer(self, values, **kwargs): return MultiIndex( levels=[[] for _ in range(self.nlevels)], codes=[[] for _ in range(self.nlevels)], - **kwargs + **kwargs, ) return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: hash(key) try: self.get_loc(key) @@ -1043,10 +1043,10 @@ def _shallow_copy(self, values=None, **kwargs): return self.copy(**kwargs) @cache_readonly - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype("O") - def _is_memory_usage_qualified(self): + def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ def f(l): @@ -1055,18 +1055,18 @@ def f(l): return any(f(l) for l in self._inferred_type_levels) @Appender(Index.memory_usage.__doc__) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize # a tuple representation unnecessarily return self._nbytes(deep) @cache_readonly - def nbytes(self): + def nbytes(self) -> int: """ return the number of bytes in the underlying data """ return self._nbytes(False) - def _nbytes(self, deep=False): + def _nbytes(self, deep: bool = False) -> int: """ return the number of bytes in the underlying data deeply introspect the level data if deep=True @@ -1217,7 +1217,7 @@ def format( # -------------------------------------------------------------------- - def __len__(self): + def __len__(self) -> int: return len(self.codes[0]) def _get_names(self): @@ -1322,28 +1322,27 @@ def _constructor(self): return MultiIndex.from_tuples @cache_readonly - def inferred_type(self): + def inferred_type(self) -> str: return "mixed" - def _get_level_number(self, level): + def _get_level_number(self, level) -> int: count = self.names.count(level) if (count > 1) and not is_integer(level): raise ValueError( - "The name %s occurs multiple times, use a level number" % level + f"The name {level} occurs multiple times, use a level number" ) try: level = self.names.index(level) except ValueError: if not is_integer(level): - raise KeyError("Level %s not found" % str(level)) + raise KeyError(f"Level {level} not found") elif level < 0: level += self.nlevels if level < 0: orig_level = level - self.nlevels raise IndexError( - "Too many levels: Index has only %d " - "levels, %d is not a valid level number" - % (self.nlevels, orig_level) + f"Too many levels: Index has only {self.nlevels} levels," + f" {orig_level} is not a valid level number" ) # Note: levels are zero-based elif level >= self.nlevels: @@ -1396,13 +1395,8 @@ def values(self): self._tuples = lib.fast_zip(values) return self._tuples - @property - def _has_complex_internals(self): - # to disable groupby tricks - return True - @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ return if the index is monotonic increasing (only equal or increasing) values. @@ -1427,7 +1421,7 @@ def is_monotonic_increasing(self): return Index(self.values).is_monotonic @cache_readonly - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ return if the index is monotonic decreasing (only equal or decreasing) values. 
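# --- Illustrative sketch (not part of the patch) ---------------------------
# The MultiIndex monotonicity properties annotated above return plain bools
# based on the lexicographic order of the tuples, e.g.:
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
assert mi.is_monotonic_increasing       # tuples are lexicographically sorted
assert not mi.is_monotonic_decreasing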
@@ -1791,10 +1785,10 @@ def to_flat_index(self): return Index(self.values, tupleize_cols=False) @property - def is_all_dates(self): + def is_all_dates(self) -> bool: return False - def is_lexsorted(self): + def is_lexsorted(self) -> bool: """ Return True if the codes are lexicographically sorted. @@ -1997,8 +1991,8 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" d = dict( - levels=[lev for lev in self.levels], - codes=[level_codes for level_codes in self.codes], + levels=list(self.levels), + codes=list(self.codes), sortorder=self.sortorder, names=list(self.names), ) @@ -2291,8 +2285,8 @@ def reorder_levels(self, order): order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: raise AssertionError( - "Length of order must be same as " - "number of levels (%d), got %d" % (self.nlevels, len(order)) + f"Length of order must be same as number of levels ({self.nlevels})," + f" got {len(order)}" ) new_levels = [self.levels[i] for i in order] new_codes = [self.codes[i] for i in order] @@ -2604,8 +2598,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - "Key length (%d) was greater than MultiIndex" - " lexsort depth (%d)" % (len(tup), self.lexsort_depth) + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth" + f" ({self.lexsort_depth})" ) n = len(tup) @@ -2616,7 +2610,7 @@ def _partial_tup_index(self, tup, side="left"): if lab not in lev: if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): - raise TypeError("Level type mismatch: %s" % lab) + raise TypeError(f"Level type mismatch: {lab}") # short circuit loc = lev.searchsorted(lab, side=side) @@ -3131,7 +3125,7 @@ def truncate(self, before=None, after=None): return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3464,7 +3458,7 @@ def isin(self, values, level=None): MultiIndex._add_logical_methods_disabled() -def _sparsify(label_list, start=0, sentinel=""): +def _sparsify(label_list, start: int = 0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) @@ -3492,7 +3486,7 @@ def _sparsify(label_list, start=0, sentinel=""): return list(zip(*result)) -def _get_na_rep(dtype): +def _get_na_rep(dtype) -> str: return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 12a9201b06283..29f56259dac79 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -15,6 +15,8 @@ is_float_dtype, is_integer_dtype, is_scalar, + is_signed_integer_dtype, + is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) @@ -22,10 +24,12 @@ ABCFloat64Index, ABCInt64Index, ABCRangeIndex, + ABCSeries, ABCUInt64Index, ) from pandas.core.dtypes.missing import isna +from pandas._typing import Dtype from pandas.core import algorithms import pandas.core.common as com from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs @@ -44,7 +48,7 @@ class NumericIndex(Index): _is_numeric_dtype = True def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): - + cls._validate_dtype(dtype) if fastpath is not None: warnings.warn( "The 'fastpath' keyword is 
deprecated, and will be " @@ -55,8 +59,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): if fastpath: return cls._simple_new(data, name=name) - # is_scalar, generators handled in coerce_to_ndarray - data = cls._coerce_to_ndarray(data) + # Coerce to ndarray if not already ndarray or Index + if not isinstance(data, (np.ndarray, Index)): + if is_scalar(data): + raise cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + + data = np.asarray(data, dtype=dtype) if issubclass(data.dtype.type, str): cls._string_data_error(data) @@ -71,6 +83,22 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): name = data.name return cls._simple_new(subarr, name=name) + @classmethod + def _validate_dtype(cls, dtype: Dtype) -> None: + if dtype is None: + return + validation_metadata = { + "int64index": (is_signed_integer_dtype, "signed integer"), + "uint64index": (is_unsigned_integer_dtype, "unsigned integer"), + "float64index": (is_float_dtype, "float"), + "rangeindex": (is_signed_integer_dtype, "signed integer"), + } + + validation_func, expected = validation_metadata[cls._typ] + if not validation_func(dtype): + msg = f"Incorrect `dtype` passed: expected {expected}, received {dtype}" + raise ValueError(msg) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["ix", "loc", "getitem", None] @@ -134,7 +162,7 @@ def _concat_same_dtype(self, indexes, name): return result.rename(name) @property - def is_all_dates(self): + def is_all_dates(self) -> bool: """ Checks that all the labels are datetime objects. """ @@ -206,7 +234,7 @@ class IntegerIndex(NumericIndex): This is an abstract class for Int64Index, UInt64Index. """ - def __contains__(self, key): + def __contains__(self, key) -> bool: """ Check if key is a float and has a decimal. If it has, return False. """ @@ -228,12 +256,12 @@ class Int64Index(IntegerIndex): _default_dtype = np.int64 @property - def inferred_type(self): + def inferred_type(self) -> str: """Always 'integer' for ``Int64Index``""" return "integer" @property - def asi8(self): + def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self.values.view("i8") @@ -283,12 +311,12 @@ class UInt64Index(IntegerIndex): _default_dtype = np.uint64 @property - def inferred_type(self): + def inferred_type(self) -> str: """Always 'integer' for ``UInt64Index``""" return "integer" @property - def asi8(self): + def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self.values.view("u8") @@ -356,7 +384,7 @@ class Float64Index(NumericIndex): _default_dtype = np.float64 @property - def inferred_type(self): + def inferred_type(self) -> str: """Always 'floating' for ``Float64Index``""" return "floating" @@ -425,7 +453,7 @@ def get_value(self, series, key): return new_values - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. 
""" @@ -447,7 +475,7 @@ def equals(self, other): except (TypeError, ValueError): return False - def __contains__(self, other): + def __contains__(self, other) -> bool: if super().__contains__(other): return True @@ -482,7 +510,7 @@ def get_loc(self, key, method=None, tolerance=None): return super().get_loc(key, method=method, tolerance=tolerance) @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 @Appender(Index.isin.__doc__) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ca7be9ba512da..cae1380e930f1 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -174,7 +174,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): _is_numeric_dtype = False _infer_as_myclass = True - _data = None + _data: PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True @@ -194,7 +194,7 @@ def __new__( dtype=None, copy=False, name=None, - **fields + **fields, ): valid_field_set = { @@ -310,7 +310,7 @@ def values(self): return np.asarray(self) @property - def freq(self): + def freq(self) -> DateOffset: return self._data.freq @freq.setter @@ -447,7 +447,7 @@ def _engine(self): return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) - def __contains__(self, key): + def __contains__(self, key) -> bool: if isinstance(key, Period): if key.freq != self.freq: return False @@ -574,11 +574,11 @@ def searchsorted(self, value, side="left", sorter=None): return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) @property - def is_all_dates(self): + def is_all_dates(self) -> bool: return True @property - def is_full(self): + def is_full(self) -> bool: """ Returns True if this PeriodIndex is range-like in that all Periods between start and end are present, in order. @@ -591,7 +591,7 @@ def is_full(self): return ((values[1:] - values[:-1]) < 2).all() @property - def inferred_type(self): + def inferred_type(self) -> str: # b/c data is represented as ints make sure we can't have ambiguous # indexing return "period" @@ -995,7 +995,9 @@ def memory_usage(self, deep=False): PeriodIndex._add_datetimelike_methods() -def period_range(start=None, end=None, periods=None, freq=None, name=None): +def period_range( + start=None, end=None, periods=None, freq=None, name=None +) -> PeriodIndex: """ Return a fixed frequency PeriodIndex. 
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5fa3431fc97c0..e68b340130b9b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Union +from typing import Optional, Union import warnings import numpy as np @@ -14,14 +14,13 @@ from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCTimedeltaIndex +from pandas.core.dtypes.generic import ABCTimedeltaIndex from pandas.core import ops import pandas.core.common as com @@ -29,6 +28,7 @@ import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.io.formats.printing import pprint_thing @@ -73,10 +73,10 @@ class RangeIndex(Int64Index): _typ = "rangeindex" _engine_type = libindex.Int64Engine - _range = None # type: range + _range: range # check whether self._data has been called - _cached_data = None # type: np.ndarray + _cached_data: Optional[np.ndarray] = None # -------------------------------------------------------------------- # Constructors @@ -146,7 +146,7 @@ def from_range(cls, data, name=None, dtype=None): return cls._simple_new(data, dtype=dtype, name=name) @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): result = object.__new__(cls) # handle passed None, non-integers @@ -154,25 +154,16 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # empty values = range(0, 0, 1) elif not isinstance(values, range): - return Index(values, dtype=dtype, name=name, **kwargs) + return Index(values, dtype=dtype, name=name) result._range = values - result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) result._reset_identity() return result # -------------------------------------------------------------------- - @staticmethod - def _validate_dtype(dtype): - """ require dtype to be None or int64 """ - if not (dtype is None or is_int64_dtype(dtype)): - raise TypeError("Invalid to pass a non-int64 dtype to RangeIndex") - @cache_readonly def _constructor(self): """ return the class to use for construction """ @@ -304,7 +295,7 @@ def _step(self): return self.step @cache_readonly - def nbytes(self): + def nbytes(self) -> int: """ Return the number of bytes in the underlying data. 
""" @@ -314,7 +305,7 @@ def nbytes(self): for attr_name in ["start", "stop", "step"] ) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: """ Memory usage of my values @@ -340,24 +331,24 @@ def memory_usage(self, deep=False): return self.nbytes @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype(np.int64) @property - def is_unique(self): + def is_unique(self) -> bool: """ return if the index has unique values """ return True @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: return self._range.step > 0 or len(self) <= 1 @cache_readonly - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: return self._range.step < 0 or len(self) <= 1 @property - def has_duplicates(self): + def has_duplicates(self) -> bool: return False def __contains__(self, key: Union[int, np.integer]) -> bool: @@ -663,7 +654,7 @@ def _concat_same_dtype(self, indexes, name): non_empty_indexes = [obj for obj in indexes if len(obj)] for obj in non_empty_indexes: - rng = obj._range # type: range + rng: range = obj._range if start is None: # This is set by the first non-empty index @@ -698,14 +689,14 @@ def _concat_same_dtype(self, indexes, name): # In this case return an empty range index. return RangeIndex(0, 0).rename(name) - def __len__(self): + def __len__(self) -> int: """ return the length of the RangeIndex """ return len(self._range) @property - def size(self): + def size(self) -> int: return len(self) def __getitem__(self, key): @@ -734,9 +725,8 @@ def __getitem__(self, key): # fall back to Int64Index return super().__getitem__(key) + @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented if is_integer(other) and other != 0: if len(self) == 0 or self.start % other == 0 and self.step % other == 0: @@ -772,10 +762,9 @@ def _make_evaluate_binop(op, step=False): if False, use the existing step """ + @unpack_zerodim_and_defer(op.__name__) def _evaluate_numeric_binop(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - elif isinstance(other, ABCTimedeltaIndex): + if isinstance(other, ABCTimedeltaIndex): # Defer to TimedeltaIndex implementation return NotImplemented elif isinstance(other, (timedelta, np.timedelta64)): diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 2324b8cf74c46..1fd824235c2be 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -406,6 +406,12 @@ def intersection(self, other, sort=False): """ return super().intersection(other, sort=sort) + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx.freq = None + return new_idx + def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) if ( @@ -598,15 +604,15 @@ def searchsorted(self, value, side="left", sorter=None): return self.values.searchsorted(value, side=side, sorter=sorter) - def is_type_compatible(self, typ): + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" @property - def inferred_type(self): + def inferred_type(self) -> str: return "timedelta64" @property - def is_all_dates(self): + def is_all_dates(self) -> bool: return True def insert(self, loc, item): @@ -693,7 +699,7 @@ def delete(self, loc): TimedeltaIndex._add_datetimelike_methods() -def 
_is_convertible_to_index(other): +def _is_convertible_to_index(other) -> bool: """ return a boolean whether I can attempt conversion to a TimedeltaIndex """ @@ -713,7 +719,7 @@ def _is_convertible_to_index(other): def timedelta_range( start=None, end=None, periods=None, freq=None, name=None, closed=None -): +) -> TimedeltaIndex: """ Return a fixed frequency TimedeltaIndex, with day as the default frequency. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7db54f4305c2e..b52015b738c6e 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -100,7 +100,7 @@ class IndexingError(Exception): class _NDFrameIndexer(_NDFrameIndexerBase): - _valid_types = None # type: str + _valid_types: str axis = None def __call__(self, axis=None): @@ -319,7 +319,7 @@ def _setitem_with_indexer(self, indexer, value): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._data.blocks: - blk, = self.obj._data.blocks + (blk,) = self.obj._data.blocks if 1 < blk.ndim: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) @@ -1111,7 +1111,7 @@ def _getitem_iterable(self, key, axis: int): if com.is_bool_indexer(key): # A boolean indexer key = check_bool_indexer(labels, key) - inds, = key.nonzero() + (inds,) = key.nonzero() return self.obj.take(inds, axis=axis) else: # A collection of keys @@ -1255,7 +1255,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): if com.is_bool_indexer(obj): obj = check_bool_indexer(labels, obj) - inds, = obj.nonzero() + (inds,) = obj.nonzero() return inds else: # When setting, missing keys are not allowed, even with .loc: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ff35d752150f9..966258d965681 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -251,21 +251,13 @@ def make_block(self, values, placement=None): return make_block(values, placement=placement, ndim=self.ndim) - def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): + def make_block_same_class(self, values, placement=None, ndim=None): """ Wrap given values in a block of same type as self. 
""" - if dtype is not None: - # issue 19431 fastparquet is passing this - warnings.warn( - "dtype argument is deprecated, will be removed in a future release.", - FutureWarning, - ) if placement is None: placement = self.mgr_locs if ndim is None: ndim = self.ndim - return make_block( - values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype - ) + return make_block(values, placement=placement, ndim=ndim, klass=self.__class__) def __repr__(self) -> str: # don't want to print out all of the items here @@ -288,7 +280,7 @@ def __repr__(self) -> str: return result - def __len__(self): + def __len__(self) -> int: return len(self.values) def __getstate__(self): @@ -1099,7 +1091,7 @@ def interpolate( fill_value=None, coerce=False, downcast=None, - **kwargs + **kwargs, ): inplace = validate_bool_kwarg(inplace, "inplace") @@ -1149,7 +1141,7 @@ def check_int_bool(self, inplace): fill_value=fill_value, inplace=inplace, downcast=downcast, - **kwargs + **kwargs, ) def _interpolate_with_fill( @@ -1204,7 +1196,7 @@ def _interpolate( limit_area=None, inplace=False, downcast=None, - **kwargs + **kwargs, ): """ interpolate using scipy wrappers """ @@ -1242,7 +1234,7 @@ def func(x): limit_area=limit_area, fill_value=fill_value, bounds_error=False, - **kwargs + **kwargs, ) # interp each column independently @@ -2027,7 +2019,7 @@ def to_native_types( float_format=None, decimal=".", quoting=None, - **kwargs + **kwargs, ): """ convert to our native types format, slicing if desired """ @@ -2836,6 +2828,8 @@ def _replace_coerce( if convert: block = [b.convert(numeric=False, copy=True) for b in block] return block + if convert: + return [self.convert(numeric=False, copy=True)] return self @@ -2935,6 +2929,30 @@ def where( ) return result + def replace( + self, + to_replace, + value, + inplace: bool = False, + filter=None, + regex: bool = False, + convert: bool = True, + ): + inplace = validate_bool_kwarg(inplace, "inplace") + result = self if inplace else self.copy() + if filter is None: # replace was called on a series + result.values.replace(to_replace, value, inplace=True) + if convert: + return result.convert(numeric=False, copy=not inplace) + else: + return result + else: # replace was called on a DataFrame + if not isna(value): + result.values.add_categories(value, inplace=True) + return super(CategoricalBlock, result).replace( + to_replace, value, inplace, filter, regex, convert + ) + # ----------------------------------------------------------------- # Constructor Helpers @@ -2986,7 +3004,7 @@ def get_block_type(values, dtype=None): return cls -def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=None): +def make_block(values, placement, klass=None, ndim=None, dtype=None): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. 
if isinstance(values, ABCPandasArray): @@ -2997,12 +3015,6 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=No if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype - if fastpath is not None: - # GH#19265 pyarrow is passing this - warnings.warn( - "fastpath argument is deprecated, will be removed in a future release.", - FutureWarning, - ) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 05a2803b3fc2f..6d518aa1abeb9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -2,14 +2,12 @@ Functions for preparing various inputs passed to the DataFrame or Series constructors before passing them to a BlockManager. """ -from collections import OrderedDict, abc +from collections import abc import numpy as np import numpy.ma as ma from pandas._libs import lib -import pandas.compat as compat -from pandas.compat import PY36 from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -39,13 +37,9 @@ from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical from pandas.core.construction import sanitize_array -from pandas.core.index import ( - Index, - _get_objs_combined_axis, - _union_indexes, - ensure_index, -) +from pandas.core.index import Index, ensure_index, get_objs_combined_axis from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import union_indexes from pandas.core.internals import ( create_block_manager_from_arrays, create_block_manager_from_blocks, @@ -235,7 +229,7 @@ def init_dict(data, index, columns, dtype=None): arrays.loc[missing] = [val] * missing.sum() else: - keys = com.dict_keys_to_ordered_list(data) + keys = list(data.keys()) columns = data_names = Index(keys) arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) # GH#24096 need copy to be deep for datetime64tz case @@ -331,7 +325,6 @@ def extract_index(data): have_raw_arrays = False have_series = False have_dicts = False - have_ordered = False for val in data: if isinstance(val, ABCSeries): @@ -339,8 +332,6 @@ def extract_index(data): indexes.append(val.index) elif isinstance(val, dict): have_dicts = True - if isinstance(val, OrderedDict): - have_ordered = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True @@ -350,9 +341,9 @@ def extract_index(data): raise ValueError("If using all scalar values, you must pass an index") if have_series: - index = _union_indexes(indexes) + index = union_indexes(indexes) elif have_dicts: - index = _union_indexes(indexes, sort=not (compat.PY36 or have_ordered)) + index = union_indexes(indexes, sort=False) if have_raw_arrays: lengths = list(set(raw_lengths)) @@ -498,7 +489,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: - columns = _get_objs_combined_axis(data, sort=False) + # We know pass_data is non-empty because data[0] is a Series + pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] + columns = get_objs_combined_axis(pass_data, sort=False) indexer_cache = {} @@ -531,7 +524,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): """Convert list of dicts to numpy arrays if `columns` is not passed, column names are inferred from the records - - for OrderedDict and (on 
Python>=3.6) dicts, the column names match + - for OrderedDict and dicts, the column names match the key insertion-order from the first record to the last. - For other kinds of dict-likes, the keys are lexically sorted. @@ -548,10 +541,10 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): tuple arrays, columns """ + if columns is None: gen = (list(x.keys()) for x in data) - types = (dict, OrderedDict) if PY36 else OrderedDict - sort = not any(isinstance(d, types) for d in data) + sort = not any(isinstance(d, dict) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0e97e55acddad..5e60440f1577e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -79,7 +79,6 @@ class BlockManager(PandasObject): copy(deep=True) get_dtype_counts - get_ftype_counts get_dtypes get_ftypes @@ -127,7 +126,7 @@ def __init__( do_integrity_check: bool = True, ): self.axes = [ensure_index(ax) for ax in axes] - self.blocks = tuple(blocks) # type: Tuple[Block, ...] + self.blocks: Tuple[Block, ...] = tuple(blocks) for block in blocks: if self.ndim != block.ndim: @@ -166,7 +165,7 @@ def shape(self): return tuple(len(ax) for ax in self.axes) @property - def ndim(self): + def ndim(self) -> int: return len(self.axes) def set_axis(self, axis, new_labels): @@ -246,9 +245,6 @@ def _get_counts(self, f): def get_dtype_counts(self): return self._get_counts(lambda b: b.dtype.name) - def get_ftype_counts(self): - return self._get_counts(lambda b: b.ftype) - def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return algos.take_1d(dtypes, self._blknos, allow_fill=False) @@ -260,7 +256,7 @@ def get_ftypes(self): def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = [ax for ax in self.axes] + axes_array = list(self.axes) extra_state = { "0.14.1": { @@ -319,7 +315,7 @@ def _post_setstate(self): self._known_consolidated = False self._rebuild_blknos_and_blklocs() - def __len__(self): + def __len__(self) -> int: return len(self.items) def __repr__(self) -> str: @@ -354,7 +350,7 @@ def apply( filter=None, do_integrity_check=False, consolidate=True, - **kwargs + **kwargs, ): """ iterate over the blocks, collect and create a new block manager @@ -629,7 +625,7 @@ def comp(s, regex=False): convert=convert, regex=regex, ) - if m.any(): + if m.any() or convert: new_rb = _extend_blocks(result, new_rb) else: new_rb.append(b) @@ -1394,12 +1390,12 @@ def equals(self, other): if len(self.blocks) != len(other.blocks): return False - # canonicalize block order, using a tuple combining the type - # name and then mgr_locs because there might be unconsolidated + # canonicalize block order, using a tuple combining the mgr_locs + # then type name because there might be unconsolidated # blocks (say, Categorical) which can only be distinguished by # the iteration order def canonicalize(block): - return (block.dtype.name, block.mgr_locs.as_array.tolist()) + return (block.mgr_locs.as_array.tolist(), block.dtype.name) self_blocks = sorted(self.blocks, key=canonicalize) other_blocks = sorted(other.blocks, key=canonicalize) @@ -1555,9 +1551,6 @@ def ftype(self): def get_dtype_counts(self): return {self.dtype.name: 1} - def get_ftype_counts(self): - return {self.ftype: 1} - def get_dtypes(self): return 
np.array([self._block.dtype]) @@ -1860,7 +1853,7 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block] + blocks: List[Block], ) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fb148b39c8a86..fc54c03c042b7 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -164,7 +164,7 @@ def interpolate_1d( fill_value=None, bounds_error=False, order=None, - **kwargs + **kwargs, ): """ Logic for the 1-d interpolation. The result should be 1-d, inputs @@ -300,7 +300,7 @@ def interpolate_1d( fill_value=fill_value, bounds_error=bounds_error, order=order, - **kwargs + **kwargs, ) result[preserve_nans] = np.nan return result diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7e50348962fc5..a2a40bbf93604 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -660,7 +660,7 @@ def _get_counts_nanvar( count = np.nan d = np.nan else: - mask2 = count <= ddof # type: np.ndarray + mask2: np.ndarray = count <= ddof if mask2.any(): np.putmask(d, mask2, np.nan) np.putmask(count, mask2, np.nan) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 398fa9b0c1fc0..d14fb040c4e30 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -29,6 +29,7 @@ logical_op, ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( @@ -181,41 +182,6 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # ----------------------------------------------------------------------------- -def _gen_eval_kwargs(name): - """ - Find the keyword arguments to pass to numexpr for the given operation. - - Parameters - ---------- - name : str - - Returns - ------- - eval_kwargs : dict - - Examples - -------- - >>> _gen_eval_kwargs("__add__") - {} - - >>> _gen_eval_kwargs("rtruediv") - {'reversed': True, 'truediv': True} - """ - kwargs = {} - - # Series appear to only pass __add__, __radd__, ... - # but DataFrame gets both these dunder names _and_ non-dunder names - # add, radd, ... 
- name = name.replace("__", "") - - if name.startswith("r"): - if name not in ["radd", "rand", "ror", "rxor"]: - # Exclude commutative operations - kwargs["reversed"] = True - - return kwargs - - def _get_frame_op_default_axis(name): """ Only DataFrame cares about default_axis, specifically: @@ -487,17 +453,15 @@ def _arith_method_SERIES(cls, op, special): """ str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(op_name) + @unpack_zerodim_and_defer(op_name) def wrapper(left, right): - if isinstance(right, ABCDataFrame): - return NotImplemented left, right = _align_method_SERIES(left, right) res_name = get_op_result_name(left, right) lvalues = extract_array(left, extract_numpy=True) - result = arithmetic_op(lvalues, right, op, str_rep, eval_kwargs) + result = arithmetic_op(lvalues, right, op, str_rep) return _construct_result(left, result, index=left.index, name=res_name) @@ -512,14 +476,11 @@ def _comp_method_SERIES(cls, op, special): """ op_name = _get_op_name(op, special) + @unpack_zerodim_and_defer(op_name) def wrapper(self, other): res_name = get_op_result_name(self, other) - if isinstance(other, ABCDataFrame): # pragma: no cover - # Defer to DataFrame implementation; fail early - return NotImplemented - if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") @@ -541,14 +502,11 @@ def _bool_method_SERIES(cls, op, special): """ op_name = _get_op_name(op, special) + @unpack_zerodim_and_defer(op_name) def wrapper(self, other): self, other = _align_method_SERIES(self, other, align_asobject=True) res_name = get_op_result_name(self, other) - if isinstance(other, ABCDataFrame): - # Defer to DataFrame implementation; fail early - return NotImplemented - lvalues = extract_array(self, extract_numpy=True) rvalues = extract_array(other, extract_numpy=True) @@ -688,10 +646,9 @@ def to_series(right): def _arith_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) - na_op = define_na_arithmetic_op(op, str_rep, eval_kwargs) + na_op = define_na_arithmetic_op(op, str_rep) is_logical = str_rep in ["&", "|", "^"] if op_name in _op_descriptions: diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 46c3b8b575af9..414e241af7bbd 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -3,7 +3,7 @@ ExtensionArrays. """ import operator -from typing import Any, Mapping, Union +from typing import Any, Union import numpy as np @@ -118,14 +118,14 @@ def masked_arith_op(x, y, op): return result -def define_na_arithmetic_op(op, str_rep: str, eval_kwargs): +def define_na_arithmetic_op(op, str_rep: str): def na_op(x, y): - return na_arithmetic_op(x, y, op, str_rep, eval_kwargs) + return na_arithmetic_op(x, y, op, str_rep) return na_op -def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): +def na_arithmetic_op(left, right, op, str_rep: str): """ Return the result of evaluating op on the passed in values. 
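# --- Illustrative sketch (not part of the patch) ---------------------------
# _arith_method_SERIES above now relies on unpack_zerodim_and_defer, defined in
# the new pandas/core/ops/common.py added just below, instead of hand-written
# isinstance checks.  The decorator returns NotImplemented when `other` is a
# "senior" pandas class and unwraps zero-dimensional ndarrays:
import numpy as np

from pandas.core.ops.common import unpack_zerodim_and_defer


class Thing:
    @unpack_zerodim_and_defer("__add__")
    def __add__(self, other):
        return other  # receives the already-unpacked operand


assert Thing() + np.array(5) == 5   # the 0-dim ndarray arrives as a scalar
# Thing() + pd.Series([1]) makes __add__ return NotImplemented, deferring to Series.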
@@ -136,7 +136,6 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): left : np.ndarray right : np.ndarray or scalar str_rep : str or None - eval_kwargs : kwargs to pass to expressions Returns ------- @@ -149,7 +148,7 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): import pandas.core.computation.expressions as expressions try: - result = expressions.evaluate(op, str_rep, left, right, **eval_kwargs) + result = expressions.evaluate(op, str_rep, left, right) except TypeError: result = masked_arith_op(left, right, op) @@ -157,11 +156,7 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): def arithmetic_op( - left: Union[np.ndarray, ABCExtensionArray], - right: Any, - op, - str_rep: str, - eval_kwargs: Mapping[str, bool], + left: Union[np.ndarray, ABCExtensionArray], right: Any, op, str_rep: str ): """ Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... @@ -212,7 +207,7 @@ def arithmetic_op( else: with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, eval_kwargs) + res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep) return res_values diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py new file mode 100644 index 0000000000000..f4b16cf4a0cf2 --- /dev/null +++ b/pandas/core/ops/common.py @@ -0,0 +1,66 @@ +""" +Boilerplate functions used in defining binary operations. +""" +from functools import wraps + +from pandas._libs.lib import item_from_zerodim + +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + + +def unpack_zerodim_and_defer(name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Parameters + ---------- + name : str + + Returns + ------- + decorator + """ + + def wrapper(method): + return _unpack_zerodim_and_defer(method, name) + + return wrapper + + +def _unpack_zerodim_and_defer(method, name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Ensure method returns NotImplemented when operating against "senior" + classes. Ensure zero-dimensional ndarrays are always unpacked. 
+ + Parameters + ---------- + method : binary method + name : str + + Returns + ------- + method + """ + + is_cmp = name.strip("__") in {"eq", "ne", "lt", "le", "gt", "ge"} + + @wraps(method) + def new_method(self, other): + + if is_cmp and isinstance(self, ABCIndexClass) and isinstance(other, ABCSeries): + # For comparison ops, Index does *not* defer to Series + pass + else: + for cls in [ABCDataFrame, ABCSeries, ABCIndexClass]: + if isinstance(self, cls): + break + if isinstance(other, cls): + return NotImplemented + + other = item_from_zerodim(other) + + return method(self, other) + + return new_method diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 5d3f9cd92aa1a..e3db65f11a332 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -233,7 +233,7 @@ def _make_flex_doc(op_name, typ): dtype: float64 """ -_op_descriptions = { +_op_descriptions: Dict[str, Dict[str, Optional[str]]] = { # Arithmetic Operators "add": { "op": "+", @@ -310,7 +310,7 @@ def _make_flex_doc(op_name, typ): "reverse": None, "series_examples": None, }, -} # type: Dict[str, Dict[str, Optional[str]]] +} _op_names = list(_op_descriptions.keys()) for key in _op_names: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6d877bf666881..25731c4e1c54c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -21,7 +21,7 @@ from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, groupby +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range @@ -31,7 +31,7 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset, Day, Nano, Tick -_shared_docs_kwargs = dict() # type: Dict[str, str] +_shared_docs_kwargs: Dict[str, str] = dict() class Resampler(_GroupBy, ShallowMixin): @@ -187,6 +187,7 @@ def _get_binner(self): """ binner, bins, binlabels = self._get_binner_for_time() + assert len(bins) == len(binlabels) bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper @@ -334,7 +335,7 @@ def _gotitem(self, key, ndim, subset=None): grouper = self.grouper if subset is None: subset = self.obj - grouped = groupby(subset, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis) # try the key selection try: @@ -353,7 +354,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): obj = self._selected_obj - grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis) try: if isinstance(obj, ABCDataFrame) and callable(how): @@ -793,7 +794,7 @@ def interpolate( limit_direction="forward", limit_area=None, downcast=None, - **kwargs + **kwargs, ): """ Interpolate values according to different methods. @@ -807,7 +808,7 @@ def interpolate( limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, - **kwargs + **kwargs, ) def asfreq(self, fill_value=None): @@ -868,13 +869,32 @@ def var(self, ddof=1, *args, **kwargs): @Appender(GroupBy.size.__doc__) def size(self): - # It's a special case as higher level does return - # a copy of 0-len objects. 
GH14962 result = self._downsample("size") - if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame): + if not len(self.ax): from pandas import Series - result = Series([], index=result.index, dtype="int64") + if self._selected_obj.ndim == 1: + name = self._selected_obj.name + else: + name = None + result = Series([], index=result.index, dtype="int64", name=name) + return result + + @Appender(GroupBy.count.__doc__) + def count(self): + result = self._downsample("count") + if not len(self.ax): + if self._selected_obj.ndim == 1: + result = self._selected_obj.__class__( + [], index=result.index, dtype="int64", name=self._selected_obj.name + ) + else: + from pandas import DataFrame + + result = DataFrame( + [], index=result.index, columns=result.columns, dtype="int64" + ) + return result def quantile(self, q=0.5, **kwargs): @@ -922,14 +942,6 @@ def g(self, _method=method, *args, **kwargs): g.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, g) -# groupby & aggregate methods -for method in ["count"]: - - def h(self, _method=method): - return self._downsample(_method) - - h.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, h) # series only methods for method in ["nunique"]: @@ -1369,7 +1381,7 @@ def __init__( kind=None, convention=None, base=0, - **kwargs + **kwargs, ): # Check for correctness of the keyword arguments which would # otherwise silently use the default if misspelled diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 772ac1cd93059..c2322ae626cfd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,6 +2,7 @@ concat routines """ +from typing import List import warnings import numpy as np @@ -13,11 +14,11 @@ ) import pandas.core.common as com from pandas.core.generic import NDFrame -from pandas.core.index import ( - _all_indexes_same, - _get_consensus_names, - _get_objs_combined_axis, +from pandas.core.indexes.api import ( + all_indexes_same, ensure_index, + get_consensus_names, + get_objs_combined_axis, ) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers @@ -293,7 +294,7 @@ def __init__( if isinstance(objs, dict): if keys is None: - keys = com.dict_keys_to_ordered_list(objs) + keys = list(objs.keys()) objs = [objs[k] for k in keys] else: objs = list(objs) @@ -437,13 +438,13 @@ def get_result(self): mgr = self.objs[0]._data.concat( [x._data for x in self.objs], self.new_axes ) - cons = _get_series_result_type(mgr, self.objs) + cons = self.objs[0]._constructor return cons(mgr, name=name).__finalize__(self, method="concat") # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) - cons = _get_series_result_type(data) + cons = DataFrame index, columns = self.new_axes df = cons(data, index=index) @@ -473,7 +474,7 @@ def get_result(self): if not self.copy: new_data._consolidate_inplace() - cons = _get_frame_result_type(new_data, self.objs) + cons = self.objs[0]._constructor return cons._from_axes(new_data, self.new_axes).__finalize__( self, method="concat" ) @@ -520,17 +521,13 @@ def _get_new_axes(self): new_axes[self.axis] = self._get_concat_axis() return new_axes - def _get_comb_axis(self, i): + def _get_comb_axis(self, i: int) -> Index: data_axis = self.objs[0]._get_block_manager_axis(i) - try: - return _get_objs_combined_axis( - self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort - ) - except IndexError: - types = [type(x).__name__ for x in self.objs] - raise 
TypeError("Cannot concatenate list of {types}".format(types=types)) + return get_objs_combined_axis( + self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + ) - def _get_concat_axis(self): + def _get_concat_axis(self) -> Index: """ Return index to be used along concatenation axis. """ @@ -541,7 +538,7 @@ def _get_concat_axis(self): idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names = [None] * len(self.objs) + names: List = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): @@ -617,7 +614,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde else: levels = [ensure_index(x) for x in levels] - if not _all_indexes_same(indexes): + if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes @@ -660,7 +657,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde ) # also copies - names = names + _get_consensus_names(indexes) + names = names + get_consensus_names(indexes) return MultiIndex( levels=levels, codes=codes_list, names=names, verify_integrity=False @@ -706,27 +703,3 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde return MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) - - -def _get_series_result_type(result, objs=None): - """ - return appropriate class of Series concat - input is either dict or array-like - """ - # TODO: See if we can just inline with _constructor_expanddim - # now that sparse is removed. - - # concat Series with axis 1 - if isinstance(result, dict): - return DataFrame - - # otherwise it is a SingleBlockManager (axis = 0) - return objs[0]._constructor - - -def _get_frame_result_type(result, objs): - """ - return appropriate class of DataFrame-like concat - """ - # TODO: just inline this as _constructor. - return objs[0] diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 16c04454898db..8e9edfa5f1409 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -1,4 +1,5 @@ import re +from typing import List import numpy as np @@ -10,7 +11,8 @@ from pandas.core.dtypes.missing import notna from pandas.core.arrays import Categorical -from pandas.core.frame import _shared_docs +import pandas.core.common as com +from pandas.core.frame import DataFrame, _shared_docs from pandas.core.indexes.base import Index from pandas.core.reshape.concat import concat from pandas.core.tools.numeric import to_numeric @@ -21,13 +23,13 @@ % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") ) def melt( - frame, + frame: DataFrame, id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, -): +) -> DataFrame: # TODO: what about the existing index? 
# If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` @@ -35,6 +37,7 @@ def melt( cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) + if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] @@ -45,7 +48,7 @@ def melt( else: # Check that `id_vars` are in frame id_vars = list(id_vars) - missing = Index(np.ravel(id_vars)).difference(cols) + missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError( "The following 'id_vars' are not present" @@ -67,7 +70,7 @@ def melt( else: value_vars = list(value_vars) # Check that `value_vars` are in frame - missing = Index(np.ravel(value_vars)).difference(cols) + missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError( "The following 'value_vars' are not present in" @@ -119,7 +122,7 @@ def melt( return frame._constructor(mdata, columns=mcolumns) -def lreshape(data, groups, dropna=True, label=None): +def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame: """ Reshape long-format data to wide. Generalized inverse of DataFrame.pivot @@ -129,6 +132,8 @@ def lreshape(data, groups, dropna=True, label=None): groups : dict {new_name : list_of_columns} dropna : boolean, default True + label : object, default None + Dummy kwarg, not used. Examples -------- @@ -188,7 +193,7 @@ def lreshape(data, groups, dropna=True, label=None): return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): +def wide_to_long(df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): r""" Wide panel to long format. Less flexible but more user-friendly than melt. 
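# --- Illustrative sketch (not part of the patch) ---------------------------
# The wide_to_long signature annotated above takes a DataFrame plus stub names;
# a minimal call looks like:
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "A1990": [3, 4], "A2000": [5, 6]})
longdf = pd.wide_to_long(df, stubnames="A", i="id", j="year")
# longdf is indexed by (id, year) with a single "A" column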
@@ -412,14 +417,14 @@ def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): two 2.9 """ - def get_var_names(df, stub, sep, suffix): + def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]: regex = r"^{stub}{sep}{suffix}$".format( stub=re.escape(stub), sep=re.escape(sep), suffix=suffix ) pattern = re.compile(regex) return [col for col in df.columns if pattern.match(col)] - def melt_stub(df, stub, i, j, value_vars, sep: str): + def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( df, id_vars=i, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bc23d50c634d5..fdd31b3b7c022 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,6 +6,7 @@ import datetime from functools import partial import string +from typing import TYPE_CHECKING, Optional, Tuple, Union import warnings import numpy as np @@ -39,6 +40,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -46,22 +48,25 @@ from pandas.core.internals import _transform_index, concatenate_block_managers from pandas.core.sorting import is_int64_overflow_possible +if TYPE_CHECKING: + from pandas import DataFrame, Series # noqa:F401 + @Substitution("\nleft : DataFrame") @Appender(_merge_doc, indents=0) def merge( left, right, - how="inner", + how: str = "inner", on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, - sort=False, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, suffixes=("_x", "_y"), - copy=True, - indicator=False, + copy: bool = True, + indicator: bool = False, validate=None, ): op = _MergeOperation( @@ -86,7 +91,9 @@ def merge( merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True): +def _groupby_and_merge( + by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True +): """ groupby & merge; we are always performing a left-by type operation @@ -172,7 +179,7 @@ def merge_ordered( right_by=None, fill_method=None, suffixes=("_x", "_y"), - how="outer", + how: str = "outer", ): """ Perform merge with optional filling/interpolation. @@ -298,15 +305,15 @@ def merge_asof( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, by=None, left_by=None, right_by=None, suffixes=("_x", "_y"), tolerance=None, - allow_exact_matches=True, - direction="backward", + allow_exact_matches: bool = True, + direction: str = "backward", ): """ Perform an asof merge. 
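# --- Illustrative sketch (not part of the patch) ---------------------------
# merge_asof keeps its behaviour; the hunk above only adds bool/str annotations.
# A minimal backward asof merge:
import pandas as pd

trades = pd.DataFrame(
    {"time": pd.to_datetime(["2019-01-01 09:00:01", "2019-01-01 09:00:03"]),
     "price": [100.0, 101.0]}
)
quotes = pd.DataFrame(
    {"time": pd.to_datetime(["2019-01-01 09:00:00", "2019-01-01 09:00:02"]),
     "bid": [99.5, 100.5]}
)
out = pd.merge_asof(trades, quotes, on="time", direction="backward")
# each trade picks up the most recent quote at or before its timestamp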
This is similar to a left-join except that we @@ -533,33 +540,33 @@ def merge_asof( # TODO: only copy DataFrames when modification necessary class _MergeOperation: """ - Perform a database (SQL) merge operation between two DataFrame objects - using either columns as keys or their row indexes + Perform a database (SQL) merge operation between two DataFrame or Series + objects using either columns as keys or their row indexes """ _merge_type = "merge" def __init__( self, - left, - right, - how="inner", + left: Union["Series", "DataFrame"], + right: Union["Series", "DataFrame"], + how: str = "inner", on=None, left_on=None, right_on=None, axis=1, - left_index=False, - right_index=False, - sort=True, + left_index: bool = False, + right_index: bool = False, + sort: bool = True, suffixes=("_x", "_y"), - copy=True, - indicator=False, + copy: bool = True, + indicator: bool = False, validate=None, ): - left = validate_operand(left) - right = validate_operand(right) - self.left = self.orig_left = left - self.right = self.orig_right = right + _left = _validate_operand(left) + _right = _validate_operand(right) + self.left = self.orig_left = _left + self.right = self.orig_right = _right self.how = how self.axis = axis @@ -576,6 +583,7 @@ def __init__( self.indicator = indicator + self.indicator_name: Optional[str] if isinstance(self.indicator, str): self.indicator_name = self.indicator elif isinstance(self.indicator, bool): @@ -597,11 +605,11 @@ def __init__( ) # warn user when merging between different levels - if left.columns.nlevels != right.columns.nlevels: + if _left.columns.nlevels != _right.columns.nlevels: msg = ( "merging between different levels can give an unintended " "result ({left} levels on the left, {right} on the right)" - ).format(left=left.columns.nlevels, right=right.columns.nlevels) + ).format(left=_left.columns.nlevels, right=_right.columns.nlevels) warnings.warn(msg, UserWarning) self._validate_specification() @@ -658,7 +666,9 @@ def get_result(self): return result - def _indicator_pre_merge(self, left, right): + def _indicator_pre_merge( + self, left: "DataFrame", right: "DataFrame" + ) -> Tuple["DataFrame", "DataFrame"]: columns = left.columns.union(right.columns) @@ -878,7 +888,12 @@ def _get_join_info(self): return join_index, left_indexer, right_indexer def _create_join_index( - self, index, other_index, indexer, other_indexer, how="left" + self, + index: Index, + other_index: Index, + indexer, + other_indexer, + how: str = "left", ): """ Create a join index by rearranging one index to match another @@ -1263,7 +1278,9 @@ def _validate(self, validate: str): raise ValueError("Not a valid argument for validate") -def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs): +def _get_join_indexers( + left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs +): """ Parameters @@ -1283,11 +1300,13 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs) right_keys ), "left_key and right_keys must be the same length" - # bind `sort` arg. of _factorize_keys - fkeys = partial(_factorize_keys, sort=sort) - # get left & right join labels and num. 
of levels at each location - llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys))) + mapped = ( + _factorize_keys(left_keys[n], right_keys[n], sort=sort) + for n in range(len(left_keys)) + ) + zipped = zip(*mapped) + llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1295,7 +1314,7 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs) # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) @@ -1410,13 +1429,13 @@ def __init__( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, axis=1, suffixes=("_x", "_y"), - copy=True, + copy: bool = True, fill_method=None, - how="outer", + how: str = "outer", ): self.fill_method = fill_method @@ -1471,12 +1490,12 @@ def get_result(self): return result -def _asof_function(direction): +def _asof_function(direction: str): name = "asof_join_{dir}".format(dir=direction) return getattr(libjoin, name, None) -def _asof_by_function(direction): +def _asof_by_function(direction: str): name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) return getattr(libjoin, name, None) @@ -1508,19 +1527,19 @@ def __init__( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, by=None, left_by=None, right_by=None, axis=1, suffixes=("_x", "_y"), - copy=True, + copy: bool = True, fill_method=None, - how="asof", + how: str = "asof", tolerance=None, - allow_exact_matches=True, - direction="backward", + allow_exact_matches: bool = True, + direction: str = "backward", ): self.by = by @@ -1757,13 +1776,15 @@ def flip(xs): return func(left_values, right_values, self.allow_exact_matches, tolerance) -def _get_multiindex_indexer(join_keys, index, sort): - - # bind `sort` argument - fkeys = partial(_factorize_keys, sort=sort) +def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): # left & right join labels and num. 
of levels at each location - rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys))) + mapped = ( + _factorize_keys(index.levels[n], join_keys[n], sort=sort) + for n in range(index.nlevels) + ) + zipped = zip(*mapped) + rcodes, lcodes, shape = [list(x) for x in zipped] if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: @@ -1786,12 +1807,12 @@ def _get_multiindex_indexer(join_keys, index, sort): lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) # factorize keys to a dense i8 space - lkey, rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) return libjoin.left_outer_join(lkey, rkey, count, sort=sort) -def _get_single_indexer(join_key, index, sort=False): +def _get_single_indexer(join_key, index, sort: bool = False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( @@ -1801,7 +1822,7 @@ def _get_single_indexer(join_key, index, sort=False): return left_indexer, right_indexer -def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): +def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): if len(join_keys) > 1: if not ( (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) @@ -1915,7 +1936,7 @@ def _factorize_keys(lk, rk, sort=True): return llab, rlab, count -def _sort_labels(uniques, left, right): +def _sort_labels(uniques: np.ndarray, left, right): if not isinstance(uniques, np.ndarray): # tuplesafe uniques = Index(uniques).values @@ -1930,7 +1951,7 @@ def _sort_labels(uniques, left, right): return new_left, new_right -def _get_join_keys(llab, rlab, shape, sort): +def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow pred = lambda i: not is_int64_overflow_possible(shape[:i]) @@ -1970,7 +1991,7 @@ def _any(x) -> bool: return x is not None and com.any_not_none(*x) -def validate_operand(obj): +def _validate_operand(obj: FrameOrSeries) -> "DataFrame": if isinstance(obj, ABCDataFrame): return obj elif isinstance(obj, ABCSeries): @@ -1985,7 +2006,7 @@ def validate_operand(obj): ) -def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): +def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): """ If two indices overlap, add suffixes to overlapping entries. diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 404292fe4d539..27d6a28a33cc6 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Callable, Dict, Tuple, Union + import numpy as np from pandas.util._decorators import Appender, Substitution @@ -9,11 +11,14 @@ import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper -from pandas.core.index import Index, MultiIndex, _get_objs_combined_axis +from pandas.core.index import Index, MultiIndex, get_objs_combined_axis from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series +if TYPE_CHECKING: + from pandas import DataFrame + # Note: We need to make sure `frame` is imported before `pivot`, otherwise # _shared_docs['pivot_table'] will not yet exist. 
TODO: Fix this dependency @@ -180,14 +185,14 @@ def pivot_table( def _add_margins( - table, + table: Union["Series", "DataFrame"], data, values, rows, cols, aggfunc, observed=None, - margins_name="All", + margins_name: str = "All", fill_value=None, ): if not isinstance(margins_name, str): @@ -200,12 +205,13 @@ def _add_margins( grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) - # could be passed a Series object with no 'columns' - if hasattr(table, "columns"): + if table.ndim == 2: + # i.e. DataFramae for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) + key: Union[str, Tuple[str, ...]] if len(rows) > 1: key = (margins_name,) + ("",) * (len(rows) - 1) else: @@ -216,7 +222,7 @@ def _add_margins( # one column in the data. Compute grand margin and return it. return table.append(Series({key: grand_margin[margins_name]})) - if values: + elif values: marginal_result_set = _generate_marginal_results( table, data, @@ -232,12 +238,15 @@ def _add_margins( return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: + # no values, and table is a DataFrame + assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set + row_margin = row_margin.reindex(result.columns, fill_value=fill_value) # populate grand margin for k in margin_keys: @@ -252,9 +261,12 @@ def _add_margins( row_names = result.index.names try: + # check the result column and leave floats for dtype in set(result.dtypes): cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].astype(dtype) + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) result = result.append(margin_dummy) except TypeError: @@ -266,7 +278,7 @@ def _add_margins( return result -def _compute_grand_margin(data, values, aggfunc, margins_name="All"): +def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): if values: grand_margin = {} @@ -289,7 +301,15 @@ def _compute_grand_margin(data, values, aggfunc, margins_name="All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, grand_margin, margins_name="All" + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name: str = "All", ): if len(cols) > 0: # need to "interleave" the margins @@ -353,7 +373,7 @@ def _all_key(key): def _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name="All" + table: "DataFrame", data, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins @@ -406,7 +426,7 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data, index=None, columns=None, values=None): +def pivot(data: "DataFrame", index=None, columns=None, values=None): if values is None: cols = [columns] if index is None else [index, columns] append = index is None @@ -436,8 +456,8 @@ def crosstab( colnames=None, aggfunc=None, margins=False, - margins_name="All", - dropna=True, + margins_name: str = "All", + dropna: bool = True, normalize=False, ): """ @@ -541,9 +561,12 @@ def crosstab( rownames = _get_names(index, rownames, prefix="row") colnames = _get_names(columns, 
colnames, prefix="col") - common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) + common_idx = None + pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] + if pass_objs: + common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - data = {} + data: Dict = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) @@ -570,7 +593,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, - **kwargs + **kwargs, ) # Post-process @@ -582,7 +605,7 @@ def crosstab( return table -def _normalize(table, normalize, margins, margins_name="All"): +def _normalize(table, normalize, margins: bool, margins_name="All"): if not isinstance(normalize, (bool, str)): axis_subs = {0: "index", 1: "columns"} @@ -594,7 +617,7 @@ def _normalize(table, normalize, margins, margins_name="All"): if margins is False: # Actual Normalizations - normalizers = { + normalizers: Dict[Union[bool, str], Callable] = { "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), @@ -665,7 +688,7 @@ def _normalize(table, normalize, margins, margins_name="All"): return table -def _get_names(arrs, names, prefix="row"): +def _get_names(arrs, names, prefix: str = "row"): if names is None: names = [] for i, arr in enumerate(arrs): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 073bb4707f890..bfaa49dd576dc 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -496,7 +496,7 @@ def _convert_bin_to_datelike_type(bins, dtype): def _format_labels( - bins, precision, right: bool = True, include_lowest: bool = False, dtype=None + bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None ): """ based on the dtype, return our labels """ @@ -565,7 +565,7 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original): return fac, bins -def _round_frac(x, precision): +def _round_frac(x, precision: int): """ Round the fractional part of the given number """ @@ -580,7 +580,7 @@ def _round_frac(x, precision): return np.around(x, digits) -def _infer_precision(base_precision, bins): +def _infer_precision(base_precision: int, bins) -> int: """Infer an appropriate precision for _round_frac """ for precision in range(base_precision, 20): diff --git a/pandas/core/series.py b/pandas/core/series.py index 15f405e244d0f..1843ffb1afaec 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1,7 +1,6 @@ """ Data structure for 1-dimensional cross-sectional and time series data """ -from collections import OrderedDict from io import StringIO from shutil import get_terminal_size from textwrap import dedent @@ -13,7 +12,6 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs -from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -98,23 +96,6 @@ ) -# see gh-16971 -def remove_na(arr): - """ - Remove null values from array like structure. - - .. deprecated:: 0.21.0 - Use s[s.notnull()] instead. - """ - - warnings.warn( - "remove_na is deprecated and is a private function. Do not use.", - FutureWarning, - stacklevel=2, - ) - return remove_na_arraylike(arr) - - def _coerce_method(converter): """ Install the scalar coercion methods. 
@@ -172,23 +153,13 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Copy input data. """ - _metadata = [] # type: List[str] + _metadata: List[str] = [] _accessors = {"dt", "cat", "str", "sparse"} _deprecations = ( base.IndexOpsMixin._deprecations | generic.NDFrame._deprecations | frozenset( - [ - "asobject", - "compress", - "valid", - "ftype", - "real", - "imag", - "put", - "ptp", - "nonzero", - ] + ["compress", "valid", "ftype", "real", "imag", "put", "ptp", "nonzero"] ) ) @@ -196,7 +167,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): hasnans = property( base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) - _data = None # type: SingleBlockManager + _data: SingleBlockManager # ---------------------------------------------------------------------- # Constructors @@ -364,41 +335,8 @@ def _init_dict(self, data, index=None, dtype=None): # Now we just make sure the order is respected, if any if data and index is not None: s = s.reindex(index, copy=False) - elif not PY36 and not isinstance(data, OrderedDict) and data: - # Need the `and data` to avoid sorting Series(None, index=[...]) - # since that isn't really dict-like - try: - s = s.sort_index() - except TypeError: - pass return s._data, s.index - @classmethod - def from_array( - cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False - ): - """ - Construct Series from array. - - .. deprecated:: 0.23.0 - Use pd.Series(..) constructor instead. - - Returns - ------- - Series - Constructed Series. - """ - warnings.warn( - "'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.Series(..) " - "constructor instead.", - FutureWarning, - stacklevel=2, - ) - return cls( - arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath - ) - # ---------------------------------------------------------------------- @property @@ -588,24 +526,6 @@ def get_values(self): def _internal_get_values(self): return self._data.get_values() - @property - def asobject(self): - """ - Return object Series which contains boxed values. - - .. deprecated:: 0.23.0 - - Use ``astype(object)`` instead. - - *this is an internal non-public method* - """ - warnings.warn( - "'asobject' is deprecated. Use 'astype(object)' instead", - FutureWarning, - stacklevel=2, - ) - return self.astype(object).values - # ops def ravel(self, order="C"): """ @@ -712,7 +632,7 @@ def put(self, *args, **kwargs): ) self._values.put(*args, **kwargs) - def __len__(self): + def __len__(self) -> int: """ Return the length of the Series. """ @@ -844,9 +764,10 @@ def __array_ufunc__( inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) + + name: Optional[Hashable] if len(set(names)) == 1: - # we require names to be hashable, right? - name = names[0] # type: Any + name = names[0] else: name = None @@ -1017,7 +938,7 @@ def _unpickle_series_compat(self, state): self.name = name else: - raise Exception("cannot unpickle legacy formats -> [%s]" % state) + raise Exception(f"cannot unpickle legacy formats -> [{state}]") # indexers @property @@ -1312,7 +1233,7 @@ def _set_labels(self, key, value): indexer = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): - raise ValueError("%s not contained in the index" % str(key[mask])) + raise ValueError(f"{key[mask]} not contained in the index") self._set_values(indexer, value) def _set_values(self, key, value): @@ -2096,7 +2017,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): will be NA. 
*args, **kwargs Additional arguments and keywords have no effect but might be - accepted for compatability with NumPy. + accepted for compatibility with NumPy. Returns ------- @@ -2600,7 +2521,7 @@ def dot(self, other): rvals = np.asarray(other) if lvals.shape[0] != rvals.shape[0]: raise Exception( - "Dot product shape mismatch, %s vs %s" % (lvals.shape, rvals.shape) + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" ) if isinstance(other, ABCDataFrame): @@ -2612,7 +2533,7 @@ def dot(self, other): elif isinstance(rvals, np.ndarray): return np.dot(lvals, rvals) else: # pragma: no cover - raise TypeError("unsupported type: %s" % type(other)) + raise TypeError(f"unsupported type: {type(other)}") def __matmul__(self, other): """ @@ -3092,8 +3013,7 @@ def _try_kind_sort(arr): if is_list_like(ascending): if len(ascending) != 1: raise ValueError( - "Length of ascending (%d) must be 1 " - "for Series" % (len(ascending)) + f"Length of ascending ({len(ascending)}) must be 1 for Series" ) ascending = ascending[0] @@ -3797,7 +3717,7 @@ def _gotitem(self, key, ndim, subset=None): see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", - **_shared_doc_kwargs + **_shared_doc_kwargs, ) @Appender(generic._shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): @@ -4021,7 +3941,7 @@ def _reduce( skipna=skipna, numeric_only=numeric_only, filter_type=filter_type, - **kwds + **kwds, ) def _reindex_indexer(self, new_index, indexer, copy): @@ -4258,7 +4178,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs + **kwargs, ): return super().fillna( value=value, @@ -4267,7 +4187,7 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs + **kwargs, ) @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) @@ -4679,26 +4599,6 @@ def dropna(self, axis=0, inplace=False, how=None): else: return self.copy() - def valid(self, inplace=False, **kwargs): - """ - Return Series without null values. - - .. deprecated:: 0.23.0 - Use :meth:`Series.dropna` instead. - - Returns - ------- - Series - Series without null values. - """ - warnings.warn( - "Method .valid will be removed in a future version. 
" - "Use .dropna instead.", - FutureWarning, - stacklevel=2, - ) - return self.dropna(inplace=inplace, **kwargs) - # ---------------------------------------------------------------------- # Time series-oriented methods diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7194d1cf08e4a..137c37f938dfa 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import Dict, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List import warnings import numpy as np @@ -15,10 +15,14 @@ ensure_object, is_bool_dtype, is_categorical_dtype, + is_extension_array_dtype, is_integer, + is_integer_dtype, is_list_like, + is_object_dtype, is_re, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -28,9 +32,14 @@ ) from pandas.core.dtypes.missing import isna +from pandas._typing import ArrayLike, Dtype from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com +from pandas.core.construction import extract_array + +if TYPE_CHECKING: + from pandas.arrays import StringArray _cpython_optimized_encoders = ( "utf-8", @@ -43,7 +52,7 @@ ) _cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() def cat_core(list_of_columns: List, sep: str): @@ -109,10 +118,79 @@ def cat_safe(list_of_columns: List, sep: str): def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA - return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + if is_extension_array_dtype(arr.dtype): + # just StringDtype + arr = extract_array(arr) + return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + + +def _map_stringarray( + func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype +) -> ArrayLike: + """ + Map a callable over valid elements of a StringArrray. + + Parameters + ---------- + func : Callable[[str], Any] + Apply to each valid element. + arr : StringArray + na_value : Any + The value to use for missing values. By default, this is + the original value (NA). + dtype : Dtype + The result dtype to use. Specifying this aviods an intermediate + object-dtype allocation. + + Returns + ------- + ArrayLike + An ExtensionArray for integer or string dtypes, otherwise + an ndarray. + + """ + from pandas.arrays import IntegerArray, StringArray + + mask = isna(arr) + + assert isinstance(arr, StringArray) + arr = np.asarray(arr) + + if is_integer_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + func, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype("int64"), + ) + if not na_value_is_na: + mask[:] = False -def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): + return IntegerArray(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, func, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + # TODO: BooleanArray + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, func, mask.view("uint8")) + + +def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) @@ -143,7 +221,7 @@ def g(x): except (TypeError, AttributeError): return na_value - return _map(g, arr, dtype=dtype) + return _map_object(g, arr, dtype=dtype) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: @@ -634,7 +712,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_repeat(arr, repeats): @@ -685,7 +763,7 @@ def scalar_rep(x): except TypeError: return str.__mul__(x, repeats) - return _na_map(scalar_rep, arr) + return _na_map(scalar_rep, arr, dtype=str) else: def rep(x, r): @@ -1150,7 +1228,7 @@ def str_join(arr, sep): 4 NaN dtype: object """ - return _na_map(sep.join, arr) + return _na_map(sep.join, arr, dtype=str) def str_findall(arr, pat, flags=0): @@ -1381,7 +1459,7 @@ def str_pad(arr, width, side="left", fillchar=" "): else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_split(arr, pat=None, n=None): @@ -1487,7 +1565,7 @@ def str_slice(arr, start=None, stop=None, step=None): """ obj = slice(start, stop, step) f = lambda x: x[obj] - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_slice_replace(arr, start=None, stop=None, repl=None): @@ -1578,7 +1656,7 @@ def f(x): y += x[local_stop:] return y - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_strip(arr, to_strip=None, side="both"): @@ -1603,7 +1681,7 @@ def str_strip(arr, to_strip=None, side="both"): f = lambda x: x.rstrip(to_strip) else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_wrap(arr, width, **kwargs): @@ -1667,7 +1745,7 @@ def str_wrap(arr, width, **kwargs): tw = textwrap.TextWrapper(**kwargs) - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) def str_translate(arr, table): @@ -1687,7 +1765,7 @@ def str_translate(arr, table): ------- Series or Index """ - return _na_map(lambda x: x.translate(table), arr) + return _na_map(lambda x: x.translate(table), arr, dtype=str) def str_get(arr, i): @@ -1875,7 +1953,7 @@ def _noarg_wrapper( docstring=None, forbidden_types=["bytes"], returns_string=True, - **kargs + **kargs, ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): @@ -1898,7 +1976,7 @@ def _pat_wrapper( name=None, forbidden_types=["bytes"], returns_string=True, - **kwargs + **kwargs, ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): @@ -3025,7 +3103,7 @@ def normalize(self, form): import unicodedata f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent) + result = _na_map(f, self._parent, dtype=str) return self._wrap_result(result) _shared_docs[ @@ -3206,7 +3284,7 @@ def rindex(self, sub, start=0, end=None): """ # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args = {} # type: Dict[str, Dict[str, str]] + _doc_args: Dict[str, Dict[str, str]] = {} _doc_args["lower"] = dict(type="lowercase", method="lower", version="") _doc_args["upper"] = dict(type="uppercase", method="upper", version="") _doc_args["title"] = dict(type="titlecase", 
method="title", version="") @@ -3223,31 +3301,37 @@ def rindex(self, sub, start=0, end=None): lambda x: x.lower(), name="lower", docstring=_shared_docs["casemethods"] % _doc_args["lower"], + dtype=str, ) upper = _noarg_wrapper( lambda x: x.upper(), name="upper", docstring=_shared_docs["casemethods"] % _doc_args["upper"], + dtype=str, ) title = _noarg_wrapper( lambda x: x.title(), name="title", docstring=_shared_docs["casemethods"] % _doc_args["title"], + dtype=str, ) capitalize = _noarg_wrapper( lambda x: x.capitalize(), name="capitalize", docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + dtype=str, ) swapcase = _noarg_wrapper( lambda x: x.swapcase(), name="swapcase", docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + dtype=str, ) casefold = _noarg_wrapper( lambda x: x.casefold(), name="casefold", docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + dtype=str, ) _shared_docs[ @@ -3401,59 +3485,69 @@ def rindex(self, sub, start=0, end=None): _doc_args["istitle"] = dict(type="titlecase", method="istitle") _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") + # force _noarg_wrapper return type with dtype=bool (GH 29624) isalnum = _noarg_wrapper( lambda x: x.isalnum(), name="isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], returns_string=False, + dtype=bool, ) isalpha = _noarg_wrapper( lambda x: x.isalpha(), name="isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], returns_string=False, + dtype=bool, ) isdigit = _noarg_wrapper( lambda x: x.isdigit(), name="isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], returns_string=False, + dtype=bool, ) isspace = _noarg_wrapper( lambda x: x.isspace(), name="isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"], returns_string=False, + dtype=bool, ) islower = _noarg_wrapper( lambda x: x.islower(), name="islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"], returns_string=False, + dtype=bool, ) isupper = _noarg_wrapper( lambda x: x.isupper(), name="isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"], returns_string=False, + dtype=bool, ) istitle = _noarg_wrapper( lambda x: x.istitle(), name="istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"], returns_string=False, + dtype=bool, ) isnumeric = _noarg_wrapper( lambda x: x.isnumeric(), name="isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], returns_string=False, + dtype=bool, ) isdecimal = _noarg_wrapper( lambda x: x.isdecimal(), name="isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], returns_string=False, + dtype=bool, ) @classmethod diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bb8d15896b727..453d1cca2e085 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -857,7 +857,7 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(list(set(required) - set(unit_rev.keys()))) + req = sorted(set(required) - set(unit_rev.keys())) if len(req): raise ValueError( "to assemble mappings requires at least that " @@ -866,7 +866,7 @@ def f(value): ) # keys we don't recognize - excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) + excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) if len(excess): raise ValueError( "extra keys have been passed " diff --git a/pandas/core/util/hashing.py 
b/pandas/core/util/hashing.py index 23c370638b572..fa3582755a202 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -133,7 +133,7 @@ def hash_pandas_object( h = Series(h, index=obj.index, dtype="uint64", copy=False) else: - raise TypeError("Unexpected type for hashing %s" % type(obj)) + raise TypeError(f"Unexpected type for hashing {type(obj)}") return h diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 3fd567f97edae..453fd12495543 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,5 +1,6 @@ """Common utility functions for rolling operations""" from collections import defaultdict +from typing import Callable, Optional import warnings import numpy as np @@ -62,12 +63,20 @@ def __init__(self, obj, *args, **kwargs): cov = _dispatch("cov", other=None, pairwise=None) def _apply( - self, func, name=None, window=None, center=None, check_minp=None, **kwargs + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + **kwargs, ): """ Dispatch to apply; we are stripping all of the _apply kwargs and performing the original function call on the grouped object. """ + kwargs.pop("floor", None) # TODO: can we de-duplicate with _dispatch? def f(x, name=name, *args): @@ -267,6 +276,44 @@ def _use_window(minp, window): return minp +def calculate_min_periods( + window: int, + min_periods: Optional[int], + num_values: int, + required_min_periods: int, + floor: int, +) -> int: + """ + Calculates final minimum periods value for rolling aggregations. + + Parameters + ---------- + window : passed window value + min_periods : passed min periods value + num_values : total number of values + required_min_periods : required min periods per aggregation function + floor : required min periods per aggregation function + + Returns + ------- + min_periods : int + """ + if min_periods is None: + min_periods = window + else: + min_periods = max(required_min_periods, min_periods) + if min_periods > window: + raise ValueError( + "min_periods {min_periods} must be <= " + "window {window}".format(min_periods=min_periods, window=window) + ) + elif min_periods > num_values: + min_periods = num_values + 1 + elif min_periods < 0: + raise ValueError("min_periods must be >= 0") + return max(min_periods, floor) + + def _zsqrt(x): with np.errstate(all="ignore"): result = np.sqrt(x) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 89c25c07b0dbf..c9837afd96356 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -2,7 +2,7 @@ import numpy as np -import pandas._libs.window as libwindow +import pandas._libs.window.aggregations as window_aggregations from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -228,11 +228,11 @@ def _apply(self, func, **kwargs): # if we have a string function name, wrap it if isinstance(func, str): - cfunc = getattr(libwindow, func, None) + cfunc = getattr(window_aggregations, func, None) if cfunc is None: raise ValueError( "we do not support this function " - "in libwindow.{func}".format(func=func) + "in window_aggregations.{func}".format(func=func) ) def func(arg): @@ -284,7 +284,7 @@ def var(self, bias=False, *args, **kwargs): nv.validate_window_func("var", args, kwargs) def f(arg): - return libwindow.ewmcov( + return window_aggregations.ewmcov( arg, arg, self.com, @@ -328,7 +328,7 @@ def cov(self, other=None, pairwise=None, bias=False, 
**kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = libwindow.ewmcov( + cov = window_aggregations.ewmcov( X._prep_values(), Y._prep_values(), self.com, @@ -375,7 +375,7 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return libwindow.ewmcov( + return window_aggregations.ewmcov( x, y, self.com, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index caf2f9e1c9dd3..2f37ba9b8f725 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -3,13 +3,15 @@ similar to how we have a Groupby object. """ from datetime import timedelta +from functools import partial from textwrap import dedent from typing import Callable, Dict, List, Optional, Set, Tuple, Union import warnings import numpy as np -import pandas._libs.window as libwindow +import pandas._libs.window.aggregations as window_aggregations +import pandas._libs.window.indexers as window_indexers from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -43,15 +45,15 @@ _doc_template, _flex_binary_moment, _offset, - _require_min_periods, _shared_docs, _use_window, _zsqrt, + calculate_min_periods, ) class _Window(PandasObject, ShallowMixin, SelectionMixin): - _attributes = [ + _attributes: List[str] = [ "window", "min_periods", "center", @@ -59,8 +61,8 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): "axis", "on", "closed", - ] # type: List[str] - exclusions = set() # type: Set[str] + ] + exclusions: Set[str] = set() def __init__( self, @@ -72,7 +74,7 @@ def __init__( axis: Axis = 0, on: Optional[Union[str, Index]] = None, closed: Optional[str] = None, - **kwargs + **kwargs, ): self.__dict__.update(kwargs) @@ -366,40 +368,56 @@ def _center_window(self, result, window) -> np.ndarray: result = np.copy(result[tuple(lead_indexer)]) return result - def _get_roll_func( - self, cfunc: Callable, check_minp: Callable, index: np.ndarray, **kwargs - ) -> Callable: + def _get_roll_func(self, func_name: str) -> Callable: """ Wrap rolling function to check values passed. Parameters ---------- - cfunc : callable + func_name : str Cython function used to calculate rolling statistics - check_minp : callable - function to check minimum period parameter - index : ndarray - used for variable window Returns ------- func : callable """ + window_func = getattr(window_aggregations, func_name, None) + if window_func is None: + raise ValueError( + "we do not support this function " + "in window_aggregations.{func_name}".format(func_name=func_name) + ) + return window_func - def func(arg, window, min_periods=None, closed=None): - minp = check_minp(min_periods, window) - return cfunc(arg, window, minp, index, closed, **kwargs) + def _get_cython_func_type(self, func): + """ + Return a variable or fixed cython function type. - return func + Variable algorithms do not use window while fixed do. 
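The min_periods handling that calculate_min_periods centralizes above can be seen from the user side as follows (illustrative sketch only):

import pandas as pd
import numpy as np

s = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0])

# By default min_periods equals the window size, so early/incomplete windows
# yield NaN; lowering it lets windows with fewer valid observations emit a value.
print(s.rolling(window=3).mean())
print(s.rolling(window=3, min_periods=1).mean())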
+ """ + if self.is_freq_type: + return self._get_roll_func("{}_variable".format(func)) + return partial( + self._get_roll_func("{}_fixed".format(func)), win=self._get_window() + ) + + def _get_window_indexer(self): + """ + Return an indexer class that will compute the window start and end bounds + """ + if self.is_freq_type: + return window_indexers.VariableWindowIndexer + return window_indexers.FixedWindowIndexer def _apply( self, - func: Union[str, Callable], + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, name: Optional[str] = None, - window: Optional[Union[int, str]] = None, - center: Optional[bool] = None, - check_minp: Optional[Callable] = None, - **kwargs + **kwargs, ): """ Rolling statistical measure using supplied function. @@ -408,13 +426,13 @@ def _apply( Parameters ---------- - func : str/callable to apply - name : str, optional - name of this function - window : int/str, default to _get_window() - window length or offset - center : bool, default to self.center - check_minp : function, default to _use_window + func : callable function to apply + center : bool + require_min_periods : int + floor: int + is_weighted + name: str, + compatibility with groupby.rolling **kwargs additional arguments for rolling function and window function @@ -422,23 +440,16 @@ def _apply( ------- y : type of input """ - - if center is None: - center = self.center - - if check_minp is None: - check_minp = _use_window - - if window is None: - win_type = self._get_win_type(kwargs) - window = self._get_window(win_type=win_type) + win_type = self._get_win_type(kwargs) + window = self._get_window(win_type=win_type) blocks, obj = self._create_blocks() block_list = list(blocks) index_as_array = self._get_index() + window_indexer = self._get_window_indexer() results = [] - exclude = [] # type: List[Scalar] + exclude: List[Scalar] = [] for i, b in enumerate(blocks): try: values = self._prep_values(b.values) @@ -455,36 +466,27 @@ def _apply( results.append(values.copy()) continue - # if we have a string function name, wrap it - if isinstance(func, str): - cfunc = getattr(libwindow, func, None) - if cfunc is None: - raise ValueError( - "we do not support this function " - "in libwindow.{func}".format(func=func) - ) - - func = self._get_roll_func(cfunc, check_minp, index_as_array, **kwargs) - # calculation function - if center: - offset = _offset(window, center) - additional_nans = np.array([np.NaN] * offset) + offset = _offset(window, center) if center else 0 + additional_nans = np.array([np.nan] * offset) + + if not is_weighted: def calc(x): - return func( - np.concatenate((x, additional_nans)), - window, - min_periods=self.min_periods, - closed=self.closed, + x = np.concatenate((x, additional_nans)) + min_periods = calculate_min_periods( + window, self.min_periods, len(x), require_min_periods, floor ) + start, end = window_indexer( + x, window, self.closed, index_as_array + ).get_window_bounds() + return func(x, start, end, min_periods) else: def calc(x): - return func( - x, window, min_periods=self.min_periods, closed=self.closed - ) + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods) with np.errstate(all="ignore"): if values.ndim > 1: @@ -995,8 +997,8 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. 
return sig.get_window(win_type, window, False).astype(float) - def _get_roll_func( - self, cfunc: Callable, check_minp: Callable, index: np.ndarray, **kwargs + def _get_weighted_roll_func( + self, cfunc: Callable, check_minp: Callable, **kwargs ) -> Callable: def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, len(window)) @@ -1070,25 +1072,38 @@ def aggregate(self, func, *args, **kwargs): @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - return self._apply("roll_weighted_sum", **kwargs) + window_func = self._get_roll_func("roll_weighted_sum") + window_func = self._get_weighted_roll_func(window_func, _use_window) + return self._apply( + window_func, center=self.center, is_weighted=True, name="sum", **kwargs + ) @Substitution(name="window") @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - return self._apply("roll_weighted_mean", **kwargs) + window_func = self._get_roll_func("roll_weighted_mean") + window_func = self._get_weighted_roll_func(window_func, _use_window) + return self._apply( + window_func, center=self.center, is_weighted=True, name="mean", **kwargs + ) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) - return self._apply("roll_weighted_var", ddof=ddof, **kwargs) + window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) + window_func = self._get_weighted_roll_func(window_func, _use_window) + kwargs.pop("name", None) + return self._apply( + window_func, center=self.center, is_weighted=True, name="var", **kwargs + ) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(ddof=ddof, **kwargs)) + return _zsqrt(self.var(ddof=ddof, name="std", **kwargs)) class _Rolling(_Window): @@ -1203,9 +1218,9 @@ def apply(self, func, raw=None, args=(), kwargs={}): from pandas import Series kwargs.pop("_level", None) + kwargs.pop("floor", None) window = self._get_window() offset = _offset(window, self.center) - index_as_array = self._get_index() # TODO: default is for backward compat # change to False in the future @@ -1221,28 +1236,31 @@ def apply(self, func, raw=None, args=(), kwargs={}): ) raw = True - def f(arg, window, min_periods, closed): - minp = _use_window(min_periods, window) + window_func = partial( + self._get_cython_func_type("roll_generic"), + args=args, + kwargs=kwargs, + raw=raw, + offset=offset, + func=func, + ) + + def apply_func(values, begin, end, min_periods, raw=raw): if not raw: - arg = Series(arg, index=self.obj.index) - return libwindow.roll_generic( - arg, - window, - minp, - index_as_array, - closed, - offset, - func, - raw, - args, - kwargs, - ) + values = Series(values, index=self.obj.index) + return window_func(values, begin, end, min_periods) - return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw) + # TODO: Why do we always pass center=False? 
+ # name=func for WindowGroupByMixin._apply + return self._apply(apply_func, center=False, floor=0, name=func) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - return self._apply("roll_sum", "sum", **kwargs) + window_func = self._get_cython_func_type("roll_sum") + kwargs.pop("floor", None) + return self._apply( + window_func, center=self.center, floor=0, name="sum", **kwargs + ) _shared_docs["max"] = dedent( """ @@ -1257,7 +1275,8 @@ def sum(self, *args, **kwargs): def max(self, *args, **kwargs): nv.validate_window_func("max", args, kwargs) - return self._apply("roll_max", "max", **kwargs) + window_func = self._get_cython_func_type("roll_max") + return self._apply(window_func, center=self.center, name="max", **kwargs) _shared_docs["min"] = dedent( """ @@ -1298,11 +1317,13 @@ def max(self, *args, **kwargs): def min(self, *args, **kwargs): nv.validate_window_func("min", args, kwargs) - return self._apply("roll_min", "min", **kwargs) + window_func = self._get_cython_func_type("roll_min") + return self._apply(window_func, center=self.center, name="min", **kwargs) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - return self._apply("roll_mean", "mean", **kwargs) + window_func = self._get_cython_func_type("roll_mean") + return self._apply(window_func, center=self.center, name="mean", **kwargs) _shared_docs["median"] = dedent( """ @@ -1342,27 +1363,40 @@ def mean(self, *args, **kwargs): ) def median(self, **kwargs): - return self._apply("roll_median_c", "median", **kwargs) + window_func = self._get_roll_func("roll_median_c") + window_func = partial(window_func, win=self._get_window()) + return self._apply(window_func, center=self.center, name="median", **kwargs) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - window = self._get_window() - index_as_array = self._get_index() + kwargs.pop("require_min_periods", None) + window_func = self._get_cython_func_type("roll_var") - def f(arg, *args, **kwargs): - minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt( - libwindow.roll_var(arg, window, minp, index_as_array, self.closed, ddof) - ) + def zsqrt_func(values, begin, end, min_periods): + return _zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + # ddof passed again for compat with groupby.rolling return self._apply( - f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + zsqrt_func, + center=self.center, + require_min_periods=1, + name="std", + ddof=ddof, + **kwargs, ) def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) + kwargs.pop("require_min_periods", None) + window_func = partial(self._get_cython_func_type("roll_var"), ddof=ddof) + # ddof passed again for compat with groupby.rolling return self._apply( - "roll_var", "var", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + window_func, + center=self.center, + require_min_periods=1, + name="var", + ddof=ddof, + **kwargs, ) _shared_docs[ @@ -1377,8 +1411,14 @@ def var(self, ddof=1, *args, **kwargs): """ def skew(self, **kwargs): + window_func = self._get_cython_func_type("roll_skew") + kwargs.pop("require_min_periods", None) return self._apply( - "roll_skew", "skew", check_minp=_require_min_periods(3), **kwargs + window_func, + center=self.center, + require_min_periods=3, + name="skew", + **kwargs, ) _shared_docs["kurt"] = dedent( @@ -1414,8 +1454,14 @@ def skew(self, **kwargs): ) def kurt(self, **kwargs): + window_func = self._get_cython_func_type("roll_kurt") + 
kwargs.pop("require_min_periods", None) return self._apply( - "roll_kurt", "kurt", check_minp=_require_min_periods(4), **kwargs + window_func, + center=self.center, + require_min_periods=4, + name="kurt", + **kwargs, ) _shared_docs["quantile"] = dedent( @@ -1475,31 +1521,22 @@ def kurt(self, **kwargs): ) def quantile(self, quantile, interpolation="linear", **kwargs): - window = self._get_window() - index_as_array = self._get_index() - - def f(arg, *args, **kwargs): - minp = _use_window(self.min_periods, window) - if quantile == 1.0: - return libwindow.roll_max( - arg, window, minp, index_as_array, self.closed - ) - elif quantile == 0.0: - return libwindow.roll_min( - arg, window, minp, index_as_array, self.closed - ) - else: - return libwindow.roll_quantile( - arg, - window, - minp, - index_as_array, - self.closed, - quantile, - interpolation, - ) + if quantile == 1.0: + window_func = self._get_cython_func_type("roll_max") + elif quantile == 0.0: + window_func = self._get_cython_func_type("roll_min") + else: + window_func = partial( + self._get_roll_func("roll_quantile"), + win=self._get_window(), + quantile=quantile, + interpolation=interpolation, + ) - return self._apply(f, "quantile", quantile=quantile, **kwargs) + # Pass through for groupby.rolling + kwargs["quantile"] = quantile + kwargs["interpolation"] = interpolation + return self._apply(window_func, center=self.center, name="quantile", **kwargs) _shared_docs[ "cov" @@ -1854,7 +1891,8 @@ def count(self): # different impl for freq counting if self.is_freq_type: - return self._apply("roll_count", "count") + window_func = self._get_roll_func("roll_count") + return self._apply(window_func, center=self.center, name="count") return super().count() diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 63dd40a229dfc..4f690a57893d1 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -87,7 +87,7 @@ class PyperclipException(RuntimeError): class PyperclipWindowsException(PyperclipException): def __init__(self, message): - message += " (%s)" % ctypes.WinError() + message += f" ({ctypes.WinError()})" super().__init__(message) @@ -95,8 +95,8 @@ def _stringifyText(text): acceptedTypes = (str, int, float, bool) if not isinstance(text, acceptedTypes): raise PyperclipException( - "only str, int, float, and bool values" - "can be copied to the clipboard, not".format(text.__class__.__name__) + f"only str, int, float, and bool values" + f"can be copied to the clipboard, not {text.__class__.__name__}" ) return str(text) @@ -599,9 +599,9 @@ def set_clipboard(clipboard): } if clipboard not in clipboard_types: + allowed_clipboard_types = [repr(_) for _ in clipboard_types.keys()] raise ValueError( - "Argument must be one of %s" - % (", ".join([repr(_) for _ in clipboard_types.keys()])) + f"Argument must be one of {', '.join(allowed_clipboard_types)}" ) # Sets pyperclip's copy() and paste() functions: diff --git a/pandas/io/common.py b/pandas/io/common.py index 0bef14e4999c7..c0eddb679c6f8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -109,7 +109,7 @@ def _is_url(url) -> bool: def _expand_user( - filepath_or_buffer: FilePathOrBuffer[AnyStr] + filepath_or_buffer: FilePathOrBuffer[AnyStr], ) -> FilePathOrBuffer[AnyStr]: """Return the argument with an initial component of ~ or ~user replaced by that user's home directory. 
@@ -139,7 +139,7 @@ def _validate_header_arg(header) -> None: def _stringify_path( - filepath_or_buffer: FilePathOrBuffer[AnyStr] + filepath_or_buffer: FilePathOrBuffer[AnyStr], ) -> FilePathOrBuffer[AnyStr]: """Attempt to convert a path-like object to a string. @@ -418,7 +418,7 @@ def _get_handle( except ImportError: need_text_wrapping = BufferedIOBase # type: ignore - handles = list() # type: List[IO] + handles: List[IO] = list() f = path_or_buf # Convert pathlib.Path/py.path.local or string @@ -528,7 +528,7 @@ def __init__( file: FilePathOrBuffer, mode: str, archive_name: Optional[str] = None, - **kwargs + **kwargs, ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index ab64bc14344f1..7fdca2d65b05d 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -57,8 +57,8 @@ def _check_columns(cols): for i, n in enumerate(map(len, tail)): if n != N: raise AssertionError( - "All columns must have the same length: {0}; " - "column {1} has length {2}".format(N, i, n) + f"All columns must have the same length: {N}; " + f"column {i} has length {n}" ) return N diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d0ab6dd37596c..c442f0d9bf66c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -8,7 +8,7 @@ from pandas._config import config from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_kwarg +from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like @@ -165,8 +165,9 @@ result 'foo' If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + index will be returned unaltered as an object data type. If you don`t want to + parse some cells as date just change their type in Excel to "Text". + For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. date_parser : function, optional @@ -188,11 +189,6 @@ Comments out remainder of line. Pass a character or characters to this argument to indicate comments in the input file. Any data between the comment string and the end of the current line is ignored. -skip_footer : int, default 0 - Alias of `skipfooter`. - - .. deprecated:: 0.23.0 - Use `skipfooter` instead. skipfooter : int, default 0 Rows at the end to skip (0-indexed). 
convert_float : bool, default True @@ -277,7 +273,6 @@ @Appender(_read_excel_doc) -@deprecate_kwarg("skip_footer", "skipfooter") def read_excel( io, sheet_name=0, @@ -300,11 +295,10 @@ def read_excel( date_parser=None, thousands=None, comment=None, - skip_footer=0, skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): for arg in ("sheet", "sheetname", "parse_cols"): @@ -344,7 +338,7 @@ def read_excel( skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) @@ -417,7 +411,7 @@ def parse( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): _validate_header_arg(header) @@ -517,7 +511,7 @@ def parse( skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) output[asheetname] = parser.read(nrows=nrows) @@ -694,7 +688,7 @@ def __init__( date_format=None, datetime_format=None, mode="w", - **engine_kwargs + **engine_kwargs, ): # validate that this engine can handle the extension if isinstance(path, str): @@ -848,7 +842,7 @@ def parse( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): """ Parse specified sheet(s) into a DataFrame. @@ -886,7 +880,7 @@ def parse( skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) @property diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 3a67f8306fff1..78054936f50f2 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -61,7 +61,7 @@ def get_sheet_by_name(self, name: str): if table.getAttribute("name") == name: return table - raise ValueError("sheet {} not found".format(name)) + raise ValueError(f"sheet {name} not found") def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: """Parse an ODF Table into a list of lists @@ -76,12 +76,12 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: empty_rows = 0 max_row_len = 0 - table = [] # type: List[List[Scalar]] + table: List[List[Scalar]] = [] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 - table_row = [] # type: List[Scalar] + table_row: List[Scalar] = [] for j, sheet_cell in enumerate(sheet_cells): if sheet_cell.qname == table_cell_name: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index d8f5da5ab5bc6..d278c6b3bbef2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -46,7 +46,8 @@ def save(self): @classmethod def _convert_to_style(cls, style_dict): """ - converts a style_dict to an openpyxl style object + Converts a style_dict to an openpyxl style object. + Parameters ---------- style_dict : style dictionary to convert @@ -72,7 +73,8 @@ def _convert_to_style(cls, style_dict): def _convert_to_style_kwargs(cls, style_dict): """ Convert a style_dict to a set of kwargs suitable for initializing - or updating-on-copy an openpyxl v2 style object + or updating-on-copy an openpyxl v2 style object. + Parameters ---------- style_dict : dict @@ -83,6 +85,7 @@ def _convert_to_style_kwargs(cls, style_dict): 'alignment' 'number_format' 'protection' + Returns ------- style_kwargs : dict @@ -107,7 +110,8 @@ def _convert_to_style_kwargs(cls, style_dict): @classmethod def _convert_to_color(cls, color_spec): """ - Convert ``color_spec`` to an openpyxl v2 Color object + Convert ``color_spec`` to an openpyxl v2 Color object. 
+ Parameters ---------- color_spec : str, dict @@ -120,6 +124,7 @@ def _convert_to_color(cls, color_spec): 'tint' 'index' 'type' + Returns ------- color : openpyxl.styles.Color @@ -135,7 +140,8 @@ def _convert_to_color(cls, color_spec): @classmethod def _convert_to_font(cls, font_dict): """ - Convert ``font_dict`` to an openpyxl v2 Font object + Convert ``font_dict`` to an openpyxl v2 Font object. + Parameters ---------- font_dict : dict @@ -154,6 +160,7 @@ def _convert_to_font(cls, font_dict): 'outline' 'shadow' 'condense' + Returns ------- font : openpyxl.styles.Font @@ -185,11 +192,13 @@ def _convert_to_stop(cls, stop_seq): """ Convert ``stop_seq`` to a list of openpyxl v2 Color objects, suitable for initializing the ``GradientFill`` ``stop`` parameter. + Parameters ---------- stop_seq : iterable An iterable that yields objects suitable for consumption by ``_convert_to_color``. + Returns ------- stop : list of openpyxl.styles.Color @@ -200,7 +209,8 @@ def _convert_to_stop(cls, stop_seq): @classmethod def _convert_to_fill(cls, fill_dict): """ - Convert ``fill_dict`` to an openpyxl v2 Fill object + Convert ``fill_dict`` to an openpyxl v2 Fill object. + Parameters ---------- fill_dict : dict @@ -216,6 +226,7 @@ def _convert_to_fill(cls, fill_dict): 'top' 'bottom' 'stop' + Returns ------- fill : openpyxl.styles.Fill @@ -262,7 +273,8 @@ def _convert_to_fill(cls, fill_dict): @classmethod def _convert_to_side(cls, side_spec): """ - Convert ``side_spec`` to an openpyxl v2 Side object + Convert ``side_spec`` to an openpyxl v2 Side object. + Parameters ---------- side_spec : str, dict @@ -270,6 +282,7 @@ def _convert_to_side(cls, side_spec): of the following keys (or their synonyms). 'style' ('border_style') 'color' + Returns ------- side : openpyxl.styles.Side @@ -295,7 +308,8 @@ def _convert_to_side(cls, side_spec): @classmethod def _convert_to_border(cls, border_dict): """ - Convert ``border_dict`` to an openpyxl v2 Border object + Convert ``border_dict`` to an openpyxl v2 Border object. + Parameters ---------- border_dict : dict @@ -311,6 +325,7 @@ def _convert_to_border(cls, border_dict): 'diagonalUp' ('diagonalup') 'diagonalDown' ('diagonaldown') 'outline' + Returns ------- border : openpyxl.styles.Border @@ -335,7 +350,8 @@ def _convert_to_border(cls, border_dict): @classmethod def _convert_to_alignment(cls, alignment_dict): """ - Convert ``alignment_dict`` to an openpyxl v2 Alignment object + Convert ``alignment_dict`` to an openpyxl v2 Alignment object. + Parameters ---------- alignment_dict : dict @@ -515,7 +531,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: - data = [] # type: List[List[Scalar]] + data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 2ba3842d5c0c9..ee617d2013136 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,5 +1,3 @@ -import warnings - from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_integer, is_list_like @@ -136,16 +134,11 @@ def _maybe_convert_usecols(usecols): return usecols if is_integer(usecols): - warnings.warn( - ( - "Passing in an integer for `usecols` has been " - "deprecated. Please pass in a list of int from " - "0 to `usecols` inclusive instead." 
- ), - FutureWarning, - stacklevel=2, + raise ValueError( + "Passing an integer for `usecols` is no longer supported. " + "Please pass in a list of int from 0 to `usecols` " + "inclusive instead." ) - return list(range(usecols + 1)) if isinstance(usecols, str): return _range2cols(usecols) diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 07bf265da4863..6d9ff9be5249a 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -168,7 +168,7 @@ def __init__( date_format=None, datetime_format=None, mode="w", - **engine_kwargs + **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. import xlsxwriter @@ -182,7 +182,7 @@ def __init__( date_format=date_format, datetime_format=datetime_format, mode=mode, - **engine_kwargs + **engine_kwargs, ) self.book = xlsxwriter.Workbook(path, **engine_kwargs) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index fe3d0a208de6a..996ae1caa14c8 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -77,7 +77,9 @@ def write_cells( wks.write(startrow + cell.row, startcol + cell.col, val, style) @classmethod - def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): + def _style_to_xlwt( + cls, item, firstlevel: bool = True, field_sep=",", line_sep=";" + ) -> str: """helper which recursively generate an xlwt easy style string for example: @@ -117,6 +119,7 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): def _convert_to_style(cls, style_dict, num_format_str=None): """ converts a style_dict to an xlwt style object + Parameters ---------- style_dict : style dictionary to convert diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dd6519275ad15..dffe04fb63720 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -3,14 +3,13 @@ from distutils.version import LooseVersion from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import deprecate_kwarg from pandas import DataFrame, Int64Index, RangeIndex from pandas.io.common import _stringify_path -def to_feather(df, path): +def to_feather(df: DataFrame, path): """ Write a DataFrame to the feather-format @@ -66,7 +65,6 @@ def to_feather(df, path): feather.write_feather(df, path) -@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads") def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path. @@ -89,11 +87,6 @@ def read_feather(path, columns=None, use_threads=True): If not provided, all columns are read. .. versionadded:: 0.24.0 - nthreads : int, default 1 - Number of CPU threads to use when reading to pandas.DataFrame. - - .. versionadded:: 0.21.0 - .. deprecated:: 0.24.0 use_threads : bool, default True Whether to parallelize reading using multiple threads. 
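A minimal usage sketch of the user-facing ``read_excel`` keyword changes above (the ``skip_footer`` alias is removed and an integer ``usecols`` now raises ``ValueError``); the file name below is a placeholder, not part of the patch:

import pandas as pd

# ``skipfooter`` is the only supported spelling; the ``skip_footer`` alias is gone.
df = pd.read_excel("report.xlsx", sheet_name=0, skipfooter=2)

# An integer ``usecols`` raises ValueError; pass an explicit list of positions
# (or an Excel-style column range) instead.
df = pd.read_excel("report.xlsx", usecols=list(range(4)))  # columns 0 through 3
df = pd.read_excel("report.xlsx", usecols="A:D")           # equivalent range string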
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e25862537cbfc..f0493036b934a 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -327,7 +327,7 @@ def _save(self): self._save_chunk(start_i, end_i) - def _save_chunk(self, start_i, end_i): + def _save_chunk(self, start_i: int, end_i: int): data_index = self.data_index diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b9c847ad64c57..cd0889044094f 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -63,8 +63,9 @@ def __init__(self, inherited=None): compute_css = CSSResolver() - def __call__(self, declarations_str): - """Convert CSS declarations to ExcelWriter style + def __call__(self, declarations_str: str): + """ + Convert CSS declarations to ExcelWriter style. Parameters ---------- @@ -279,6 +280,7 @@ def build_font(self, props): if "text-shadow" in props else None ), + # FIXME: dont leave commented-out # 'vertAlign':, # 'charset': , # 'scheme': , @@ -665,7 +667,7 @@ def _format_hierarchical_rows(self): for cell in self._generate_body(gcolidx): yield cell - def _generate_body(self, coloffset): + def _generate_body(self, coloffset: int): if self.styler is None: styles = None else: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 17603809c2ea6..b18f0db622b3e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -262,6 +262,8 @@ def __init__( def _chk_truncate(self) -> None: from pandas.core.reshape.concat import concat + self.tr_row_num: Optional[int] + min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows @@ -280,7 +282,7 @@ def _chk_truncate(self) -> None: else: row_num = max_rows // 2 series = concat((series.iloc[:row_num], series.iloc[-row_num:])) - self.tr_row_num = row_num # type: Optional[int] + self.tr_row_num = row_num else: self.tr_row_num = None self.tr_series = series @@ -448,13 +450,13 @@ def _get_adjustment() -> TextAdjustment: class TableFormatter: - show_dimensions = None # type: bool - is_truncated = None # type: bool - formatters = None # type: formatters_type - columns = None # type: Index + show_dimensions: bool + is_truncated: bool + formatters: formatters_type + columns: Index @property - def should_show_dimensions(self) -> Optional[bool]: + def should_show_dimensions(self) -> bool: return self.show_dimensions is True or ( self.show_dimensions == "truncate" and self.is_truncated ) @@ -616,6 +618,8 @@ def _chk_truncate(self) -> None: # Cut the data to the information actually printed max_cols = self.max_cols max_rows = self.max_rows + self.max_rows_adj: Optional[int] + max_rows_adj: Optional[int] if max_cols == 0 or max_rows == 0: # assume we are in the terminal (w, h) = get_terminal_size() @@ -631,7 +635,7 @@ def _chk_truncate(self) -> None: self.header = cast(bool, self.header) n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row # rows available to fill with actual data - max_rows_adj = self.h - n_add_rows # type: Optional[int] + max_rows_adj = self.h - n_add_rows self.max_rows_adj = max_rows_adj # Format only rows and columns that could potentially fit the @@ -1073,7 +1077,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: return adjoined def _get_column_name_list(self) -> List[str]: - names = [] # type: List[str] + names: List[str] = [] columns = self.frame.columns if isinstance(columns, ABCMultiIndex): names.extend("" if name is None else name for name in columns.names) @@ 
-1124,8 +1128,9 @@ def format_array( List[str] """ + fmt_klass: Type[GenericArrayFormatter] if is_datetime64_dtype(values.dtype): - fmt_klass = Datetime64Formatter # type: Type[GenericArrayFormatter] + fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): fmt_klass = Datetime64TZFormatter elif is_timedelta64_dtype(values.dtype): @@ -1375,11 +1380,12 @@ def format_values_with(float_format): # There is a special default string when we are fixed-width # The default is otherwise to use str instead of a formatting string + float_format: Optional[float_format_type] if self.float_format is None: if self.fixed_width: float_format = partial( "{value: .{digits:d}f}".format, digits=self.digits - ) # type: Optional[float_format_type] + ) else: float_format = self.float_format else: @@ -1437,7 +1443,7 @@ def __init__( values: Union[np.ndarray, "Series", DatetimeIndex, DatetimeArray], nat_rep: str = "NaT", date_format: None = None, - **kwargs + **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep @@ -1658,7 +1664,7 @@ def __init__( values: Union[np.ndarray, TimedeltaIndex], nat_rep: str = "NaT", box: bool = False, - **kwargs + **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 38f2e332017f0..0c6b0c1a5810b 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -45,7 +45,7 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns - self.elements = [] # type: List[str] + self.elements: List[str] = [] self.bold_rows = self.fmt.bold_rows self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions @@ -138,11 +138,10 @@ def _write_cell( else: start_tag = "<{kind}>".format(kind=kind) + esc: Union[OrderedDict[str, str], Dict] if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict( - [("&", r"&"), ("<", r"<"), (">", r">")] - ) # type: Union[OrderedDict[str, str], Dict] + esc = OrderedDict([("&", r"&"), ("<", r"<"), (">", r">")]) else: esc = {} @@ -408,7 +407,7 @@ def _write_regular_rows( else: index_values = self.fmt.tr_frame.index.format() - row = [] # type: List[str] + row: List[str] = [] for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index ca9db88ae7be4..008a99427f3c7 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -11,8 +11,8 @@ class LatexFormatter(TableFormatter): - """ Used to render a DataFrame to a LaTeX tabular/longtable environment - output. + """ + Used to render a DataFrame to a LaTeX tabular/longtable environment output. 
Parameters ---------- @@ -106,18 +106,19 @@ def pad_empties(x): # Get rid of old multiindex column and add new ones strcols = out + strcols[1:] - column_format = self.column_format - if column_format is None: + if self.column_format is None: dtypes = self.frame.dtypes._values column_format = "".join(map(get_col_type, dtypes)) if self.fmt.index: index_format = "l" * self.frame.index.nlevels column_format = index_format + column_format - elif not isinstance(column_format, str): # pragma: no cover + elif not isinstance(self.column_format, str): # pragma: no cover raise AssertionError( "column_format must be str or unicode, " - "not {typ}".format(typ=type(column_format)) + "not {typ}".format(typ=type(self.column_format)) ) + else: + column_format = self.column_format if self.longtable: self._write_longtable_begin(buf, column_format) @@ -132,7 +133,7 @@ def pad_empties(x): if self.fmt.has_index_names and self.fmt.show_index_names: nlevels += 1 strrows = list(zip(*strcols)) - self.clinebuf = [] # type: List[List[int]] + self.clinebuf: List[List[int]] = [] for i, row in enumerate(strrows): if i == nlevels and self.fmt.header: @@ -265,7 +266,7 @@ def _format_multirow( def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: """ - Print clines after multirow-blocks are finished + Print clines after multirow-blocks are finished. """ for cl in self.clinebuf: if cl[0] == i: @@ -273,7 +274,7 @@ def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] - def _write_tabular_begin(self, buf, column_format): + def _write_tabular_begin(self, buf, column_format: str): """ Write the beginning of a tabular environment or nested table/tabular environments including caption and label. @@ -283,11 +284,10 @@ buf : string or file handle File path or object. If not specified, the result is returned as a string. - column_format : str, default None + column_format : str The columns format as specified in `LaTeX table format `__ e.g 'rcl' for 3 columns - """ if self.caption is not None or self.label is not None: # then write output in a nested table/tabular environment @@ -327,7 +327,7 @@ def _write_tabular_end(self, buf): else: pass - def _write_longtable_begin(self, buf, column_format): + def _write_longtable_begin(self, buf, column_format: str): """ Write the beginning of a longtable environment including caption and label if provided by user. @@ -337,11 +337,10 @@ buf : string or file handle File path or object. If not specified, the result is returned as a string.
- column_format : str, default None + column_format : str The columns format as specified in `LaTeX table format `__ e.g 'rcl' for 3 columns - """ buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 061103820ca83..a4f1488fb6b69 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -513,7 +513,7 @@ def format_object_attrs( list of 2-tuple """ - attrs = [] # type: List[Tuple[str, Union[str, int]]] + attrs: List[Tuple[str, Union[str, int]]] = [] if hasattr(obj, "dtype") and include_dtype: # error: "Sequence[Any]" has no attribute "dtype" attrs.append(("dtype", "'{}'".format(obj.dtype))) # type: ignore diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index dce0afd8670b2..ebe86a7f535cb 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -8,7 +8,7 @@ import copy from functools import partial from itertools import product -from typing import Optional +from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple from uuid import uuid1 import numpy as np @@ -71,6 +71,11 @@ class Styler: The ``id`` takes the form ``T__row_col`` where ```` is the unique identifier, ```` is the row number and ```` is the column number. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 Attributes ---------- @@ -126,9 +131,10 @@ def __init__( caption=None, table_attributes=None, cell_ids=True, + na_rep: Optional[str] = None, ): - self.ctx = defaultdict(list) - self._todo = [] + self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) + self._todo: List[Tuple[Callable, Tuple, Dict]] = [] if not isinstance(data, (pd.Series, pd.DataFrame)): raise TypeError("``data`` must be a Series or DataFrame") @@ -149,19 +155,24 @@ def __init__( self.precision = precision self.table_attributes = table_attributes self.hidden_index = False - self.hidden_columns = [] + self.hidden_columns: Sequence[int] = [] self.cell_ids = cell_ids + self.na_rep = na_rep # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if is_float(x): + if self.na_rep is not None and pd.isna(x): + return self.na_rep + elif is_float(x): display_format = "{0:.{precision}f}".format(x, precision=self.precision) return display_format else: return x - self._display_funcs = defaultdict(lambda: default_display_func) + self._display_funcs: DefaultDict[ + Tuple[int, int], Callable[[Any], str] + ] = defaultdict(lambda: default_display_func) def _repr_html_(self): """ @@ -416,16 +427,22 @@ def format_attr(pair): table_attributes=table_attr, ) - def format(self, formatter, subset=None): + def format(self, formatter, subset=None, na_rep: Optional[str] = None): """ Format the text display value of cells. Parameters ---------- - formatter : str, callable, or dict + formatter : str, callable, dict or None + If ``formatter`` is None, the default formatter is used subset : IndexSlice An argument to ``DataFrame.loc`` that restricts which elements ``formatter`` is applied to. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. 
versionadded:: 1.0.0 Returns ------- @@ -451,6 +468,10 @@ def format(self, formatter, subset=None): >>> df['c'] = ['a', 'b', 'c', 'd'] >>> df.style.format({'c': str.upper}) """ + if formatter is None: + assert self._display_funcs.default_factory is not None + formatter = self._display_funcs.default_factory() + if subset is None: row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) @@ -466,16 +487,16 @@ def format(self, formatter, subset=None): if is_dict_like(formatter): for col, col_formatter in formatter.items(): # formatter must be callable, so '{}' are converted to lambdas - col_formatter = _maybe_wrap_formatter(col_formatter) + col_formatter = _maybe_wrap_formatter(col_formatter, na_rep) col_num = self.data.columns.get_indexer_for([col])[0] for row_num in row_locs: self._display_funcs[(row_num, col_num)] = col_formatter else: # single scalar to format all cells with + formatter = _maybe_wrap_formatter(formatter, na_rep) locs = product(*(row_locs, col_locs)) for i, j in locs: - formatter = _maybe_wrap_formatter(formatter) self._display_funcs[(i, j)] = formatter return self @@ -553,6 +574,7 @@ def _copy(self, deepcopy=False): caption=self.caption, uuid=self.uuid, table_styles=self.table_styles, + na_rep=self.na_rep, ) if deepcopy: styler.ctx = copy.deepcopy(self.ctx) @@ -896,6 +918,23 @@ def set_table_styles(self, table_styles): self.table_styles = table_styles return self + def set_na_rep(self, na_rep: str) -> "Styler": + """ + Set the missing data representation on a Styler. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + na_rep : str + + Returns + ------- + self : Styler + """ + self.na_rep = na_rep + return self + def hide_index(self): """ Hide any indices from rendering. @@ -1450,7 +1489,7 @@ def _get_level_lengths(index, hidden_elements=None): Optional argument is a list of index positions which should not be visible. - Result is a dictionary of (level, inital_position): span + Result is a dictionary of (level, initial_position): span """ sentinel = object() levels = index.format(sparsify=sentinel, adjoin=False, names=False) @@ -1487,14 +1526,22 @@ def _get_level_lengths(index, hidden_elements=None): return non_zero_lengths -def _maybe_wrap_formatter(formatter): +def _maybe_wrap_formatter(formatter, na_rep: Optional[str]): if isinstance(formatter, str): - return lambda x: formatter.format(x) + formatter_func = lambda x: formatter.format(x) elif callable(formatter): - return formatter + formatter_func = formatter else: msg = ( "Expected a template string or callable, got {formatter} " "instead".format(formatter=formatter) ) raise TypeError(msg) + + if na_rep is None: + return formatter_func + elif isinstance(na_rep, str): + return lambda x: na_rep if pd.isna(x) else formatter_func(x) + else: + msg = "Expected a string, got {na_rep} instead".format(na_rep=na_rep) + raise TypeError(msg) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index d29078cad9318..b120de1b3011a 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -173,7 +173,7 @@ def read_gbq( location=location, configuration=configuration, credentials=credentials, - **kwargs + **kwargs, ) diff --git a/pandas/io/html.py b/pandas/io/html.py index 9a368907b65aa..5f38f866e1643 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -57,7 +57,7 @@ def _importers(): _RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") -def _remove_whitespace(s, regex=_RE_WHITESPACE): +def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str: """ Replace extra whitespace inside of a string with a single space. 
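As a quick sketch of the ``na_rep`` support added to ``Styler`` above; the frame and the chosen representations are made up for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 4.0]})

# Store-wide setting via the new set_na_rep(); every missing cell renders as "--".
html = df.style.set_na_rep("--").render()

# Or per format() call, combined with a formatter for the non-missing cells.
html = df.style.format("{:.2f}", na_rep="MISS").render()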
@@ -65,8 +65,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): ---------- s : str or unicode The string from which to remove extra whitespace. - - regex : regex + regex : re.Pattern The regular expression to use to remove extra whitespace. Returns @@ -253,7 +252,8 @@ def _text_getter(self, obj): raise AbstractMethodError(self) def _parse_td(self, obj): - """Return the td elements from a row element. + """ + Return the td elements from a row element. Parameters ---------- @@ -600,7 +600,7 @@ def _build_doc(self): ) -def _build_xpath_expr(attrs): +def _build_xpath_expr(attrs) -> str: """Build an xpath expression to simulate bs4's ability to pass in kwargs to search for attributes when using the lxml parser. @@ -810,7 +810,8 @@ def _data_to_frame(**kwargs): def _parser_dispatch(flavor): - """Choose the parser based on the input flavor. + """ + Choose the parser based on the input flavor. Parameters ---------- @@ -850,7 +851,7 @@ def _parser_dispatch(flavor): return _valid_parsers[flavor] -def _print_as_set(s): +def _print_as_set(s) -> str: return "{" + "{arg}".format(arg=", ".join(pprint_thing(el) for el in s)) + "}" @@ -895,7 +896,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): try: tables = p.parse_tables() - except Exception as caught: + except ValueError as caught: # if `io` is an io-like object, check if it's seekable # and try to rewind it before trying the next parser if hasattr(io, "seekable") and io.seekable(): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 8615355996031..89d5b52ffbf1e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -62,8 +62,10 @@ def to_json( if orient == "table" and isinstance(obj, Series): obj = obj.to_frame(name=obj.name or "values") + + writer: Type["Writer"] if orient == "table" and isinstance(obj, DataFrame): - writer = JSONTableWriter # type: Type["Writer"] + writer = JSONTableWriter elif isinstance(obj, Series): writer = SeriesWriter elif isinstance(obj, DataFrame): @@ -577,6 +579,8 @@ def read_json( dtype = True if convert_axes is None and orient != "table": convert_axes = True + if encoding is None: + encoding = "utf-8" compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( @@ -709,7 +713,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def _combine_lines(self, lines): + def _combine_lines(self, lines) -> str: """ Combines a list of JSON objects into one JSON object. """ @@ -1167,7 +1171,7 @@ def _try_convert_dates(self): convert_dates = [] convert_dates = set(convert_dates) - def is_ok(col): + def is_ok(col) -> bool: """ Return if this col is ok to try for a date parse. 
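One behavioural detail from the ``read_json`` hunk above: ``encoding`` now falls back to "utf-8" when omitted. A small illustrative sketch:

import pandas as pd

# Omitting ``encoding`` behaves the same as passing encoding="utf-8" explicitly.
df1 = pd.read_json('{"a": [1, 2], "b": [3, 4]}')
df2 = pd.read_json('{"a": [1, 2], "b": [3, 4]}', encoding="utf-8")
assert df1.equals(df2)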
""" diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 702241bde2b34..df513d4d37d71 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -267,10 +267,10 @@ def _pull_field(js, spec): meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now - records = [] # type: List + records: List = [] lengths = [] - meta_vals = defaultdict(list) # type: DefaultDict + meta_vals: DefaultDict = defaultdict(list) meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 69ee6583d12c8..edbf60cc91d0b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -89,7 +89,7 @@ def write( coerce_timestamps="ms", index=None, partition_cols=None, - **kwargs + **kwargs, ): self.validate_dataframe(df) path, _, _, _ = get_filepath_or_buffer(path, mode="wb") @@ -106,7 +106,7 @@ def write( compression=compression, coerce_timestamps=coerce_timestamps, partition_cols=partition_cols, - **kwargs + **kwargs, ) else: self.api.parquet.write_table( @@ -114,7 +114,7 @@ def write( path, compression=compression, coerce_timestamps=coerce_timestamps, - **kwargs + **kwargs, ) def read(self, path, columns=None, **kwargs): @@ -176,7 +176,7 @@ def write( compression=compression, write_index=index, partition_on=partition_cols, - **kwargs + **kwargs, ) def read(self, path, columns=None, **kwargs): @@ -205,7 +205,7 @@ def to_parquet( compression="snappy", index=None, partition_cols=None, - **kwargs + **kwargs, ): """ Write a DataFrame to the parquet format. @@ -252,7 +252,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, - **kwargs + **kwargs, ) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 058d65b9464ae..bbec148b8745d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -395,25 +395,22 @@ def _validate_integer(name, val, min_val=0): def _validate_names(names): """ - Check if the `names` parameter contains duplicates. - - If duplicates are found, we issue a warning before returning. + Raise ValueError if the `names` parameter contains duplicates. Parameters ---------- names : array-like or None An array containing a list of the names used for the output DataFrame. - Returns - ------- - names : array-like or None - The original `names` parameter. + Raises + ------ + ValueError + If names are not unique. 
""" if names is not None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") - return names def _read(filepath_or_buffer: FilePathOrBuffer, kwds): @@ -491,7 +488,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "cache_dates": True, "thousands": None, "comment": None, - "decimal": b".", + "decimal": ".", # 'engine': 'c', "parse_dates": False, "keep_date_col": False, @@ -525,8 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults = {} # type: Dict[str, Any] -_deprecated_args = set() # type: Set[str] +_deprecated_defaults: Dict[str, Any] = {} +_deprecated_args: Set[str] = set() def _make_parser_function(name, default_sep=","): @@ -571,7 +568,7 @@ def parser_f( # Quoting, Compression, and File Format compression="infer", thousands=None, - decimal=b".", + decimal: str = ".", lineterminator=None, quotechar='"', quoting=csv.QUOTE_MINIMAL, @@ -707,7 +704,7 @@ def read_fwf( colspecs="infer", widths=None, infer_nrows=100, - **kwds + **kwds, ): r""" @@ -1605,7 +1602,7 @@ def ix(col): # remove index items from content and columns, don't pop in # loop - for i in reversed(sorted(to_remove)): + for i in sorted(to_remove, reverse=True): data.pop(i) if not self._implicit_index: columns.pop(i) @@ -1637,7 +1634,7 @@ def _get_name(icol): # remove index items from content and columns, don't pop in # loop - for c in reversed(sorted(to_remove)): + for c in sorted(to_remove, reverse=True): data.pop(c) col_names.remove(c) @@ -1918,7 +1915,12 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( # noqa: E501 + ( + self.names, + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( self._reader.header, self.index_names, self.col_names, passed_names ) else: @@ -2307,7 +2309,12 @@ def __init__(self, f, **kwds): # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( # noqa: E501 + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( self.columns, self.index_names, self.col_names ) # Update list of original names to include all indices. 
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 5e066c4f9ecbd..0a0ccedd78f00 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -2,7 +2,7 @@ import pickle import warnings -from pandas.compat import PY36, pickle_compat as pc +from pandas.compat import pickle_compat as pc from pandas.io.common import _get_handle, _stringify_path @@ -140,9 +140,7 @@ def read_pickle(path, compression="infer"): # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes - excs_to_catch = (AttributeError, ImportError) - if PY36: - excs_to_catch += (ModuleNotFoundError,) + excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) try: with warnings.catch_warnings(record=True): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e98802888e582..b229e5b4e0f4e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -9,7 +9,7 @@ import os import re import time -from typing import List, Optional, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -48,13 +48,17 @@ from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.sparse import BlockIndex, IntIndex import pandas.core.common as com -from pandas.core.computation.pytables import Expr, maybe_expression +from pandas.core.computation.pytables import PyTablesExpr, maybe_expression from pandas.core.index import ensure_index from pandas.core.internals import BlockManager, _block_shape, make_block from pandas.io.common import _stringify_path from pandas.io.formats.printing import adjoin, pprint_thing +if TYPE_CHECKING: + from tables import File # noqa:F401 + + # versioning attribute _version = "0.15.2" @@ -89,10 +93,10 @@ def _ensure_str(name): return name -Term = Expr +Term = PyTablesExpr -def _ensure_term(where, scope_level): +def _ensure_term(where, scope_level: int): """ ensure that the where is a Term or a list of Term this makes sure that we are capturing the scope of variables @@ -175,9 +179,6 @@ class DuplicateWarning(Warning): # storer class map _STORER_MAP = { - "Series": "LegacySeriesFixed", - "DataFrame": "LegacyFrameFixed", - "DataMatrix": "LegacyFrameFixed", "series": "SeriesFixed", "frame": "FrameFixed", } @@ -252,7 +253,7 @@ def to_hdf( complevel=None, complib=None, append=None, - **kwargs + **kwargs, ): """ store this object, close it if we opened it """ @@ -271,7 +272,7 @@ def to_hdf( f(path_or_buf) -def read_hdf(path_or_buf, key=None, mode="r", **kwargs): +def read_hdf(path_or_buf, key=None, mode: str = "r", **kwargs): """ Read from the store, close it if we opened it. @@ -340,8 +341,8 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): if mode not in ["r", "r+", "a"]: raise ValueError( - "mode {0} is not allowed while performing a read. " - "Allowed modes are r, r+ and a.".format(mode) + f"mode {mode} is not allowed while performing a read. " + f"Allowed modes are r, r+ and a." 
) # grab the scope if "where" in kwargs: @@ -367,9 +368,7 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): exists = False if not exists: - raise FileNotFoundError( - "File {path} does not exist".format(path=path_or_buf) - ) + raise FileNotFoundError(f"File {path_or_buf} does not exist") store = HDFStore(path_or_buf, mode=mode, **kwargs) # can't auto open/close if we are using an iterator @@ -406,7 +405,7 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): raise -def _is_metadata_of(group, parent_group): +def _is_metadata_of(group, parent_group) -> bool: """Check if a given group is a metadata group for a given parent_group.""" if group._v_depth <= parent_group._v_depth: return False @@ -465,8 +464,16 @@ class HDFStore: >>> store.close() """ + _handle: Optional["File"] + def __init__( - self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs + self, + path, + mode=None, + complevel=None, + complib=None, + fletcher32: bool = False, + **kwargs, ): if "format" in kwargs: @@ -476,9 +483,7 @@ def __init__( if complib is not None and complib not in tables.filters.all_complibs: raise ValueError( - "complib only supports {libs} compression.".format( - libs=tables.filters.all_complibs - ) + f"complib only supports {tables.filters.all_complibs} compression." ) if complib is None and complevel is not None: @@ -508,28 +513,26 @@ def root(self): def filename(self): return self._path - def __getitem__(self, key): + def __getitem__(self, key: str): return self.get(key) - def __setitem__(self, key, value): + def __setitem__(self, key: str, value): self.put(key, value) - def __delitem__(self, key): + def __delitem__(self, key: str): return self.remove(key) - def __getattr__(self, name): + def __getattr__(self, name: str): """ allow attribute access to get stores """ try: return self.get(name) except (KeyError, ClosedFileError): pass raise AttributeError( - "'{object}' object has no attribute '{name}'".format( - object=type(self).__name__, name=name - ) + f"'{type(self).__name__}' object has no attribute '{name}'" ) - def __contains__(self, key): + def __contains__(self, key: str): """ check for existence of this key can match the exact pathname or the pathnm w/o the leading '/' """ @@ -540,13 +543,12 @@ def __contains__(self, key): return True return False - def __len__(self): + def __len__(self) -> int: return len(self.groups()) def __repr__(self) -> str: - return "{type}\nFile path: {path}\n".format( - type=type(self), path=pprint_thing(self._path) - ) + pstr = pprint_thing(self._path) + return f"{type(self)}\nFile path: {pstr}\n" def __enter__(self): return self @@ -554,7 +556,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() - def keys(self): + def keys(self) -> List[str]: """ Return a list of keys corresponding to objects stored in HDFStore. @@ -577,7 +579,7 @@ def items(self): iteritems = items - def open(self, mode="a", **kwargs): + def open(self, mode: str = "a", **kwargs): """ Open the file in the specified mode @@ -598,8 +600,8 @@ def open(self, mode="a", **kwargs): # this would truncate, raise here if self.is_open: raise PossibleDataLossError( - "Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!".format(self._path, self._mode) + f"Re-opening the file [{self._path}] with mode [{self._mode}] " + "will delete the current file!" 
) self._mode = mode @@ -615,40 +617,38 @@ def open(self, mode="a", **kwargs): try: self._handle = tables.open_file(self._path, self._mode, **kwargs) - except (IOError) as e: # pragma: no cover - if "can not be written" in str(e): - print("Opening {path} in read-only mode".format(path=self._path)) + except IOError as err: # pragma: no cover + if "can not be written" in str(err): + print(f"Opening {self._path} in read-only mode") self._handle = tables.open_file(self._path, "r", **kwargs) else: raise - except (ValueError) as e: + except ValueError as err: # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message - if "FILE_OPEN_POLICY" in str(e): - e = ValueError( - "PyTables [{version}] no longer supports opening multiple " - "files\n" + if "FILE_OPEN_POLICY" in str(err): + hdf_version = tables.get_hdf5_version() + err = ValueError( + f"PyTables [{tables.__version__}] no longer supports " + "opening multiple files\n" "even in read-only mode on this HDF5 version " - "[{hdf_version}]. You can accept this\n" + f"[{hdf_version}]. You can accept this\n" "and not open the same file multiple times at once,\n" "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " "which allows\n" - "files to be opened multiple times at once\n".format( - version=tables.__version__, - hdf_version=tables.get_hdf5_version(), - ) + "files to be opened multiple times at once\n" ) - raise e + raise err - except (Exception) as e: + except Exception as err: # trying to read from a non-existent file causes an error which # is not part of IOError, make it one - if self._mode == "r" and "Unable to open/create file" in str(e): - raise IOError(str(e)) + if self._mode == "r" and "Unable to open/create file" in str(err): + raise IOError(str(err)) raise def close(self): @@ -660,7 +660,7 @@ def close(self): self._handle = None @property - def is_open(self): + def is_open(self) -> bool: """ return a boolean indicating whether the file is open """ @@ -668,7 +668,7 @@ def is_open(self): return False return bool(self._handle.isopen) - def flush(self, fsync=False): + def flush(self, fsync: bool = False): """ Force all buffered modifications to be written to disk. @@ -692,13 +692,13 @@ def flush(self, fsync=False): except OSError: pass - def get(self, key): + def get(self, key: str): """ Retrieve pandas object stored in file. Parameters ---------- - key : object + key : str Returns ------- @@ -707,27 +707,27 @@ def get(self, key): """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") return self._read_group(group) def select( self, - key, + key: str, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, - auto_close=False, - **kwargs + auto_close: bool = False, + **kwargs, ): """ Retrieve pandas object stored in file, optionally based on where criteria. Parameters ---------- - key : object + key : str Object being retrieved from file. where : list, default None List of Term (or convertible) objects, optional. 
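A brief usage sketch for the ``HDFStore.put``/``HDFStore.select`` signatures annotated above (requires PyTables; the file name and data are made up):

import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})

with pd.HDFStore("demo.h5", mode="w") as store:
    # format="table" enables where-based selection on indexed data columns.
    store.put("df", df, format="table", data_columns=["A"])
    subset = store.select("df", where="A > 2")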
@@ -751,7 +751,7 @@ def select( """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") # create the storer and axes where = _ensure_term(where, scope_level=1) @@ -778,13 +778,20 @@ def func(_start, _stop, _where): return it.get_result() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, + key: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + **kwargs, + ): """ return the selection as an Index Parameters ---------- - key : object + key : str where : list of Term (or convertible) objects, optional start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection @@ -794,15 +801,16 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs where=where, start=start, stop=stop, **kwargs ) - def select_column(self, key, column, **kwargs): + def select_column(self, key: str, column: str, **kwargs): """ return a single column from the table. This is generally only useful to select an indexable Parameters ---------- - key : object - column: the column of interest + key : str + column: str + The column of interest. Raises ------ @@ -824,10 +832,11 @@ def select_as_multiple( stop=None, iterator=False, chunksize=None, - auto_close=False, - **kwargs + auto_close: bool = False, + **kwargs, ): - """ Retrieve pandas objects from multiple tables + """ + Retrieve pandas objects from multiple tables. Parameters ---------- @@ -839,6 +848,8 @@ def select_as_multiple( stop : integer (defaults to None), row number to stop selection iterator : boolean, return an iterator, default False chunksize : nrows to include in iteration, return an iterator + auto_close : bool, default False + Should automatically close the store when finished. Raises ------ @@ -860,7 +871,7 @@ def select_as_multiple( stop=stop, iterator=iterator, chunksize=chunksize, - **kwargs + **kwargs, ) if not isinstance(keys, (list, tuple)): @@ -880,11 +891,11 @@ def select_as_multiple( nrows = None for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): if t is None: - raise KeyError("Invalid table [{key}]".format(key=k)) + raise KeyError(f"Invalid table [{k}]") if not t.is_table: raise TypeError( - "object [{obj}] is not a table, and cannot be used in all " - "select as multiple".format(obj=t.pathname) + f"object [{t.pathname}] is not a table, and cannot be used in all " + "select as multiple" ) if nrows is None: @@ -925,13 +936,13 @@ def func(_start, _stop, _where): return it.get_result(coordinates=True) - def put(self, key, value, format=None, append=False, **kwargs): + def put(self, key: str, value, format=None, append=False, **kwargs): """ Store object in HDFStore. 
Parameters ---------- - key : object + key : str value : {Series, DataFrame} format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format @@ -957,7 +968,7 @@ def put(self, key, value, format=None, append=False, **kwargs): kwargs = self._validate_format(format, kwargs) self._write_to_group(key, value, append=append, **kwargs) - def remove(self, key, where=None, start=None, stop=None): + def remove(self, key: str, where=None, start=None, stop=None): """ Remove pandas object partially by specifying the where condition @@ -1010,7 +1021,14 @@ def remove(self, key, where=None, start=None, stop=None): return s.delete(where=where, start=start, stop=stop) def append( - self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs + self, + key: str, + value, + format=None, + append=True, + columns=None, + dropna=None, + **kwargs, ): """ Append to Table in file. Node must already exist and be Table @@ -1018,7 +1036,7 @@ def append( Parameters ---------- - key : object + key : str value : {Series, DataFrame} format : 'table' is the default table(t) : table format @@ -1059,7 +1077,14 @@ def append( self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) def append_to_multiple( - self, d, value, selector, data_columns=None, axes=None, dropna=False, **kwargs + self, + d: Dict, + value, + selector, + data_columns=None, + axes=None, + dropna=False, + **kwargs, ): """ Append to multiple tables @@ -1105,7 +1130,7 @@ def append_to_multiple( # figure out how to split the value remain_key = None - remain_values = [] + remain_values: List = [] for k, v in d.items(): if v is None: if remain_key is not None: @@ -1143,16 +1168,17 @@ def append_to_multiple( self.append(k, val, data_columns=dc, **kwargs) - def create_table_index(self, key, **kwargs): - """ Create a pytables index on the table + def create_table_index(self, key: str, **kwargs): + """ + Create a pytables index on the table. 
+ Parameters ---------- - key : object (the node to index) + key : str Raises ------ - raises if the node is not a table - + TypeError: raises if the node is not a table """ # version requirements @@ -1238,21 +1264,23 @@ def walk(self, where="/"): yield (g._v_pathname.rstrip("/"), groups, leaves) - def get_node(self, key): + def get_node(self, key: str): """ return the node with the key or None if it does not exist """ self._check_if_open() + if not key.startswith("/"): + key = "/" + key + + assert self._handle is not None try: - if not key.startswith("/"): - key = "/" + key return self._handle.get_node(self.root, key) - except _table_mod.exceptions.NoSuchNodeError: + except _table_mod.exceptions.NoSuchNodeError: # type: ignore return None - def get_storer(self, key): + def get_storer(self, key: str): """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") s = self._create_storer(group) s.infer_axes() @@ -1262,27 +1290,28 @@ def copy( self, file, mode="w", - propindexes=True, + propindexes: bool = True, keys=None, complib=None, complevel=None, - fletcher32=False, + fletcher32: bool = False, overwrite=True, ): - """ copy the existing store to a new file, upgrading in place - - Parameters - ---------- - propindexes: restore indexes in copied file (defaults to True) - keys : list of keys to include in the copy (defaults to all) - overwrite : overwrite (remove and replace) existing nodes in the - new store (default is True) - mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + """ + Copy the existing store to a new file, updating in place. - Returns - ------- - open file handle of the new store + Parameters + ---------- + propindexes: bool, default True + Restore indexes in copied file. + keys : list of keys to include in the copy (defaults to all) + overwrite : overwrite (remove and replace) existing nodes in the + new store (default is True) + mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + Returns + ------- + open file handle of the new store """ new_store = HDFStore( file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 @@ -1302,7 +1331,7 @@ def copy( data = self.select(k) if s.is_table: - index = False + index: Union[bool, list] = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( @@ -1317,7 +1346,7 @@ def copy( return new_store - def info(self): + def info(self) -> str: """ Print detailed information on the store. 
@@ -1327,11 +1356,11 @@ def info(self): ------- str """ - output = "{type}\nFile path: {path}\n".format( - type=type(self), path=pprint_thing(self._path) - ) + path = pprint_thing(self._path) + output = f"{type(self)}\nFile path: {path}\n" + if self.is_open: - lkeys = sorted(list(self.keys())) + lkeys = sorted(self.keys()) if len(lkeys): keys = [] values = [] @@ -1344,11 +1373,8 @@ def info(self): values.append(pprint_thing(s or "invalid_HDFStore node")) except Exception as detail: keys.append(k) - values.append( - "[invalid_HDFStore node: {detail}]".format( - detail=pprint_thing(detail) - ) - ) + dstr = pprint_thing(detail) + values.append(f"[invalid_HDFStore node: {dstr}]") output += adjoin(12, keys, values) else: @@ -1361,9 +1387,9 @@ def info(self): # private methods ###### def _check_if_open(self): if not self.is_open: - raise ClosedFileError("{0} file is not open!".format(self._path)) + raise ClosedFileError(f"{self._path} file is not open!") - def _validate_format(self, format, kwargs): + def _validate_format(self, format: str, kwargs: Dict[str, Any]) -> Dict[str, Any]: """ validate / deprecate formats; return the new kwargs """ kwargs = kwargs.copy() @@ -1371,7 +1397,7 @@ def _validate_format(self, format, kwargs): try: kwargs["format"] = _FORMAT_MAP[format.lower()] except KeyError: - raise TypeError("invalid HDFStore format specified [{0}]".format(format)) + raise TypeError(f"invalid HDFStore format specified [{format}]") return kwargs @@ -1380,16 +1406,9 @@ def _create_storer(self, group, format=None, value=None, append=False, **kwargs) def error(t): raise TypeError( - "cannot properly create the storer for: [{t}] [group->" - "{group},value->{value},format->{format},append->{append}," - "kwargs->{kwargs}]".format( - t=t, - group=group, - value=type(value), - format=format, - append=append, - kwargs=kwargs, - ) + f"cannot properly create the storer for: [{t}] [group->" + f"{group},value->{type(value)},format->{format},append->{append}," + f"kwargs->{kwargs}]" ) pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) @@ -1471,17 +1490,21 @@ def error(t): def _write_to_group( self, - key, + key: str, value, format, index=True, append=False, complib=None, encoding=None, - **kwargs + **kwargs, ): group = self.get_node(key) + # we make this assertion for mypy; the get_node call will already + # have raised if this is incorrect + assert self._handle is not None + # remove the node if we are not appending if group is not None and not append: self._handle.remove_node(group, recursive=True) @@ -1550,25 +1573,28 @@ class TableIterator: nrows : the rows to iterate on start : the passed start value (default is None) stop : the passed stop value (default is None) - iterator : boolean, whether to use the default iterator - chunksize : the passed chunking value (default is 50000) + iterator : bool, default False + Whether to use the default iterator. 
+ chunksize : the passed chunking value (default is 100000) auto_close : boolean, automatically close the store at the end of iteration, default is False - kwargs : the passed kwargs """ + chunksize: Optional[int] + store: HDFStore + def __init__( self, - store, + store: HDFStore, s, func, where, nrows, start=None, stop=None, - iterator=False, - chunksize=None, - auto_close=False, + iterator: bool = False, + chunksize: Optional[int] = None, + auto_close: bool = False, ): self.store = store self.s = s @@ -1619,7 +1645,7 @@ def close(self): if self.auto_close: self.store.close() - def get_result(self, coordinates=False): + def get_result(self, coordinates: bool = False): # return the actual iterator if self.chunksize is not None: @@ -1662,29 +1688,37 @@ class IndexCol: is_data_indexable = True _info_fields = ["freq", "tz", "index_name"] + name: str + cname: str + kind_attr: str + def __init__( self, + name: str, values=None, kind=None, typ=None, - cname=None, + cname: Optional[str] = None, itemsize=None, - name=None, axis=None, - kind_attr=None, + kind_attr: Optional[str] = None, pos=None, freq=None, tz=None, index_name=None, - **kwargs + **kwargs, ): + + if not isinstance(name, str): + raise ValueError("`name` must be a str.") + self.values = values self.kind = kind self.typ = typ self.itemsize = itemsize self.name = name - self.cname = cname - self.kind_attr = kind_attr + self.cname = cname or name + self.kind_attr = kind_attr or f"{name}_kind" self.axis = axis self.pos = pos self.freq = freq @@ -1694,36 +1728,20 @@ def __init__( self.meta = None self.metadata = None - if name is not None: - self.set_name(name, kind_attr) if pos is not None: self.set_pos(pos) - def set_name(self, name, kind_attr=None): - """ set the name of this indexer """ - self.name = name - self.kind_attr = kind_attr or "{name}_kind".format(name=name) - if self.cname is None: - self.cname = name - - return self - - def set_axis(self, axis): - """ set the axis over which I index """ - self.axis = axis - - return self + # These are ensured as long as the passed arguments match the + # constructor annotations. + assert isinstance(self.name, str) + assert isinstance(self.cname, str) + assert isinstance(self.kind_attr, str) - def set_pos(self, pos): + def set_pos(self, pos: int): """ set the position of this column in the Table """ self.pos = pos if pos is not None and self.typ is not None: self.typ._v_pos = pos - return self - - def set_table(self, table): - self.table = table - return self def __repr__(self) -> str: temp = tuple( @@ -1731,28 +1749,30 @@ def __repr__(self) -> str: ) return ",".join( ( - "{key}->{value}".format(key=key, value=value) + f"{key}->{value}" for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) ) ) - def __eq__(self, other): + def __eq__(self, other) -> bool: """ compare 2 col items """ return all( getattr(self, a, None) == getattr(other, a, None) for a in ["name", "cname", "axis", "pos"] ) - def __ne__(self, other): + def __ne__(self, other) -> bool: return not self.__eq__(other) @property - def is_indexed(self): + def is_indexed(self) -> bool: """ return whether I am an indexed column """ - try: - return getattr(self.table.cols, self.cname).is_indexed - except AttributeError: - False + if not hasattr(self.table, "cols"): + # e.g. if infer hasn't been called yet, self.table will be None. 
+ return False + # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute + # 'error: "None" has no attribute "cols"' + return getattr(self.table.cols, self.cname).is_indexed # type: ignore def copy(self): new_self = copy.copy(self) @@ -1762,12 +1782,14 @@ def infer(self, handler): """infer this column from the table: create and return a new object""" table = handler.table new_self = self.copy() - new_self.set_table(table) + new_self.table = table new_self.get_attr() new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): + def convert( + self, values: np.ndarray, nan_rep, encoding, errors, start=None, stop=None + ): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1794,8 +1816,6 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): self.values = _set_tz(self.values, self.tz) - return self - def take_data(self): """ return the values & release the memory """ self.values, values = None, self.values @@ -1840,8 +1860,8 @@ def validate(self, handler, append): def validate_names(self): pass - def validate_and_set(self, handler, append): - self.set_table(handler.table) + def validate_and_set(self, handler: "AppendableTable", append: bool): + self.table = handler.table self.validate_col() self.validate_attr(append) self.validate_metadata(handler) @@ -1859,25 +1879,22 @@ def validate_col(self, itemsize=None): itemsize = self.itemsize if c.itemsize < itemsize: raise ValueError( - "Trying to store a string with len [{itemsize}] in " - "[{cname}] column but\nthis column has a limit of " - "[{c_itemsize}]!\nConsider using min_itemsize to " - "preset the sizes on these columns".format( - itemsize=itemsize, cname=self.cname, c_itemsize=c.itemsize - ) + f"Trying to store a string with len [{itemsize}] in " + f"[{self.cname}] column but\nthis column has a limit of " + f"[{c.itemsize}]!\nConsider using min_itemsize to " + "preset the sizes on these columns" ) return c.itemsize return None - def validate_attr(self, append): + def validate_attr(self, append: bool): # check for backwards incompatibility if append: existing_kind = getattr(self.attrs, self.kind_attr, None) if existing_kind is not None and existing_kind != self.kind: raise TypeError( - "incompatible kind in col [{existing} - " - "{self_kind}]".format(existing=existing_kind, self_kind=self.kind) + f"incompatible kind in col [{existing_kind} - {self.kind}]" ) def update_info(self, info): @@ -1903,21 +1920,14 @@ def update_info(self, info): else: raise ValueError( - "invalid info for [{name}] for [{key}], " - "existing_value [{existing_value}] conflicts with " - "new value [{value}]".format( - name=self.name, - key=key, - existing_value=existing_value, - value=value, - ) + f"invalid info for [{self.name}] for [{key}], " + f"existing_value [{existing_value}] conflicts with " + f"new value [{value}]" ) else: if value is not None or existing_value is not None: idx[key] = value - return self - def set_info(self, info): """ set my state from the passed info """ idx = info.get(self.name) @@ -1936,7 +1946,7 @@ def read_metadata(self, handler): """ retrieve the metadata for this columns """ self.metadata = handler.read_metadata(self.cname) - def validate_metadata(self, handler): + def validate_metadata(self, handler: "AppendableTable"): """ validate that kind=category does not change the categories """ if self.meta == "category": new_metadata = self.metadata @@ -1951,7 +1961,7 @@ def 
validate_metadata(self, handler): "different categories to the existing" ) - def write_metadata(self, handler): + def write_metadata(self, handler: "AppendableTable"): """ set the meta data """ if self.metadata is not None: handler.write_metadata(self.cname, self.metadata) @@ -1961,10 +1971,18 @@ class GenericIndexCol(IndexCol): """ an index which is not represented in the data of the table """ @property - def is_indexed(self): + def is_indexed(self) -> bool: return False - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): + def convert( + self, + values, + nan_rep, + encoding, + errors, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """ set the values from this selection: take = take ownership Parameters @@ -1980,12 +1998,11 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): Table row number: the end of the sub-selection. Values larger than the underlying table's row count are normalized to that. """ + assert self.table is not None # for mypy - start = start if start is not None else 0 - stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows - self.values = Int64Index(np.arange(stop - start)) - - return self + _start = start if start is not None else 0 + _stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows + self.values = Int64Index(np.arange(_stop - _start)) def get_attr(self): pass @@ -2016,7 +2033,7 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) """ return a new datacol with the block i """ if cname is None: - cname = name or "values_block_{idx}".format(idx=i) + cname = name or f"values_block_{i}" if name is None: name = cname @@ -2026,7 +2043,8 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) if version[0] == 0 and version[1] <= 10 and version[2] == 0: m = re.search(r"values_block_(\d+)", name) if m: - name = "values_{group}".format(group=m.groups()[0]) + grp = m.groups()[0] + name = f"values_{grp}" except IndexError: pass @@ -2042,13 +2060,13 @@ def __init__( meta=None, metadata=None, block=None, - **kwargs + **kwargs, ): super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = "{name}_dtype".format(name=self.name) + self.dtype_attr = f"{self.name}_dtype" self.meta = meta - self.meta_attr = "{name}_meta".format(name=self.name) + self.meta_attr = f"{self.name}_meta" self.set_data(data) self.set_metadata(metadata) @@ -2060,7 +2078,7 @@ def __repr__(self) -> str: ) return ",".join( ( - "{key}->{value}".format(key=key, value=value) + f"{key}->{value}" for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) ) ) @@ -2114,11 +2132,7 @@ def set_kind(self): elif dtype.startswith("bool"): self.kind = "bool" else: - raise AssertionError( - "cannot interpret dtype of [{dtype}] in [{obj}]".format( - dtype=dtype, obj=self - ) - ) + raise AssertionError(f"cannot interpret dtype of [{dtype}] in [{self}]") # set my typ if we need if self.typ is None: @@ -2209,10 +2223,8 @@ def set_atom_string( inferred_type = lib.infer_dtype(col.ravel(), skipna=False) if inferred_type != "string": raise TypeError( - "Cannot serialize the column [{item}] because\n" - "its data contents are [{type}] object dtype".format( - item=item, type=inferred_type - ) + f"Cannot serialize the column [{item}] because\n" + f"its data contents are [{inferred_type}] object dtype" ) # itemsize is the maximum length of a string (along any dimension) @@ -2235,18 +2247,18 @@ 
def set_atom_string( self.itemsize = itemsize self.kind = "string" self.typ = self.get_atom_string(block, itemsize) - self.set_data( - data_converted.astype("|S{size}".format(size=itemsize), copy=False) - ) + self.set_data(data_converted.astype(f"|S{itemsize}", copy=False)) def get_atom_coltype(self, kind=None): """ return the PyTables column class for this column """ if kind is None: kind = self.kind if self.kind.startswith("uint"): - col_name = "UInt{name}Col".format(name=kind[4:]) + k4 = kind[4:] + col_name = f"UInt{k4}Col" else: - col_name = "{name}Col".format(name=kind.capitalize()) + kcap = kind.capitalize() + col_name = f"{kcap}Col" return getattr(_tables(), col_name) @@ -2426,8 +2438,6 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): self.data, nan_rep=nan_rep, encoding=encoding, errors=errors ) - return self - def get_attr(self): """ get the data for this column """ self.values = getattr(self.attrs, self.kind_attr, None) @@ -2450,6 +2460,7 @@ class DataIndexableCol(DataCol): def validate_names(self): if not Index(self.values).is_object(): + # TODO: should the message here be more specifically non-str? raise ValueError("cannot have non-object label DataIndexableCol") def get_atom_string(self, block, itemsize): @@ -2484,9 +2495,9 @@ class Fixed: group : the group node where the table resides """ - pandas_kind = None # type: str - obj_type = None # type: Type[Union[DataFrame, Series]] - ndim = None # type: int + pandas_kind: str + obj_type: Type[Union[DataFrame, Series]] + ndim: int is_table = False def __init__(self, parent, group, encoding=None, errors="strict", **kwargs): @@ -2494,28 +2505,29 @@ def __init__(self, parent, group, encoding=None, errors="strict", **kwargs): self.group = group self.encoding = _ensure_encoding(encoding) self.errors = errors - self.set_version() @property - def is_old_version(self): + def is_old_version(self) -> bool: return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 - def set_version(self): + @property + def version(self) -> Tuple[int, int, int]: """ compute and set our version """ version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) try: - self.version = tuple(int(x) for x in version.split(".")) - if len(self.version) == 2: - self.version = self.version + (0,) + version = tuple(int(x) for x in version.split(".")) + if len(version) == 2: + version = version + (0,) except AttributeError: - self.version = (0, 0, 0) + version = (0, 0, 0) + return version @property def pandas_type(self): return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) @property - def format_type(self): + def format_type(self) -> str: return "fixed" def __repr__(self) -> str: @@ -2524,17 +2536,15 @@ def __repr__(self) -> str: s = self.shape if s is not None: if isinstance(s, (list, tuple)): - s = "[{shape}]".format(shape=",".join(pprint_thing(x) for x in s)) - return "{type:12.12} (shape->{shape})".format( - type=self.pandas_type, shape=s - ) + jshape = ",".join(pprint_thing(x) for x in s) + s = f"[{jshape}]" + return f"{self.pandas_type:12.12} (shape->{s})" return self.pandas_type def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = str(self.pandas_kind) self.attrs.pandas_version = str(_version) - self.set_version() def copy(self): new_self = copy.copy(self) @@ -2590,7 +2600,7 @@ def storable(self): return self.group @property - def is_exists(self): + def is_exists(self) -> bool: return False @property @@ -2644,10 +2654,10 @@ class 
GenericFixed(Fixed): _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} _reverse_index_map = {v: k for k, v in _index_type_map.items()} - attributes = [] # type: List[str] + attributes: List[str] = [] # indexer helpders - def _class_to_alias(self, cls): + def _class_to_alias(self, cls) -> str: return self._index_type_map.get(cls, "") def _alias_to_class(self, alias): @@ -2676,7 +2686,7 @@ def f(values, freq=None, tz=None): return klass - def validate_read(self, kwargs): + def validate_read(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: """ remove table keywords from kwargs and return raise if any keywords are passed which are not-None @@ -2700,7 +2710,7 @@ def validate_read(self, kwargs): return kwargs @property - def is_exists(self): + def is_exists(self) -> bool: return True def set_attrs(self): @@ -2718,7 +2728,9 @@ def get_attrs(self): def write(self, obj, **kwargs): self.set_attrs() - def read_array(self, key, start=None, stop=None): + def read_array( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ): """ read an array for the specified node (off of group """ import tables @@ -2753,7 +2765,7 @@ def read_array(self, key, start=None, stop=None): return ret def read_index(self, key, **kwargs): - variety = _ensure_decoded(getattr(self.attrs, "{key}_variety".format(key=key))) + variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) if variety == "multi": return self.read_multi_index(key, **kwargs) @@ -2765,25 +2777,23 @@ def read_index(self, key, **kwargs): _, index = self.read_index_node(getattr(self.group, key), **kwargs) return index else: # pragma: no cover - raise TypeError( - "unrecognized index variety: {variety}".format(variety=variety) - ) + raise TypeError(f"unrecognized index variety: {variety}") def write_index(self, key, index): if isinstance(index, MultiIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "multi") + setattr(self.attrs, f"{key}_variety", "multi") self.write_multi_index(key, index) elif isinstance(index, BlockIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "block") + setattr(self.attrs, f"{key}_variety", "block") self.write_block_index(key, index) elif isinstance(index, IntIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "sparseint") + setattr(self.attrs, f"{key}_variety", "sparseint") self.write_sparse_intindex(key, index) else: - setattr(self.attrs, "{key}_variety".format(key=key), "regular") + setattr(self.attrs, f"{key}_variety", "regular") converted = _convert_index( - index, self.encoding, self.errors, self.format_type - ).set_name("index") + "index", index, self.encoding, self.errors, self.format_type + ) self.write_array(key, converted.values) @@ -2801,27 +2811,27 @@ def write_index(self, key, index): node._v_attrs.tz = _get_tz(index.tz) def write_block_index(self, key, index): - self.write_array("{key}_blocs".format(key=key), index.blocs) - self.write_array("{key}_blengths".format(key=key), index.blengths) - setattr(self.attrs, "{key}_length".format(key=key), index.length) - - def read_block_index(self, key, **kwargs): - length = getattr(self.attrs, "{key}_length".format(key=key)) - blocs = self.read_array("{key}_blocs".format(key=key), **kwargs) - blengths = self.read_array("{key}_blengths".format(key=key), **kwargs) + self.write_array(f"{key}_blocs", index.blocs) + self.write_array(f"{key}_blengths", index.blengths) + setattr(self.attrs, f"{key}_length", index.length) + + def read_block_index(self, key, **kwargs) -> BlockIndex: + length = 
getattr(self.attrs, f"{key}_length") + blocs = self.read_array(f"{key}_blocs", **kwargs) + blengths = self.read_array(f"{key}_blengths", **kwargs) return BlockIndex(length, blocs, blengths) def write_sparse_intindex(self, key, index): - self.write_array("{key}_indices".format(key=key), index.indices) - setattr(self.attrs, "{key}_length".format(key=key), index.length) + self.write_array(f"{key}_indices", index.indices) + setattr(self.attrs, f"{key}_length", index.length) - def read_sparse_intindex(self, key, **kwargs): - length = getattr(self.attrs, "{key}_length".format(key=key)) - indices = self.read_array("{key}_indices".format(key=key), **kwargs) + def read_sparse_intindex(self, key, **kwargs) -> IntIndex: + length = getattr(self.attrs, f"{key}_length") + indices = self.read_array(f"{key}_indices", **kwargs) return IntIndex(length, indices) def write_multi_index(self, key, index): - setattr(self.attrs, "{key}_nlevels".format(key=key), index.nlevels) + setattr(self.attrs, f"{key}_nlevels", index.nlevels) for i, (lev, level_codes, name) in enumerate( zip(index.levels, index.codes, index.names) @@ -2831,35 +2841,35 @@ def write_multi_index(self, key, index): raise NotImplementedError( "Saving a MultiIndex with an extension dtype is not supported." ) - level_key = "{key}_level{idx}".format(key=key, idx=i) + level_key = f"{key}_level{i}" conv_level = _convert_index( - lev, self.encoding, self.errors, self.format_type - ).set_name(level_key) + level_key, lev, self.encoding, self.errors, self.format_type + ) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind node._v_attrs.name = name # write the name - setattr(node._v_attrs, "{key}_name{name}".format(key=key, name=name), name) + setattr(node._v_attrs, f"{key}_name{name}", name) # write the labels - label_key = "{key}_label{idx}".format(key=key, idx=i) + label_key = f"{key}_label{i}" self.write_array(label_key, level_codes) - def read_multi_index(self, key, **kwargs): - nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key)) + def read_multi_index(self, key, **kwargs) -> MultiIndex: + nlevels = getattr(self.attrs, f"{key}_nlevels") levels = [] codes = [] names = [] for i in range(nlevels): - level_key = "{key}_level{idx}".format(key=key, idx=i) + level_key = f"{key}_level{i}" name, lev = self.read_index_node(getattr(self.group, level_key), **kwargs) levels.append(lev) names.append(name) - label_key = "{key}_label{idx}".format(key=key, idx=i) + label_key = f"{key}_label{i}" level_codes = self.read_array(label_key, **kwargs) codes.append(level_codes) @@ -2867,17 +2877,14 @@ def read_multi_index(self, key, **kwargs): levels=levels, codes=codes, names=names, verify_integrity=True ) - def read_index_node(self, node, start=None, stop=None): + def read_index_node( + self, node, start: Optional[int] = None, stop: Optional[int] = None + ): data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. 
- if "shape" in node._v_attrs and self._is_empty_array( - getattr(node._v_attrs, "shape") - ): - data = np.empty( - getattr(node._v_attrs, "shape"), - dtype=getattr(node._v_attrs, "value_type"), - ) + if "shape" in node._v_attrs and self._is_empty_array(node._v_attrs.shape): + data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) kind = _ensure_decoded(node._v_attrs.kind) name = None @@ -2908,21 +2915,21 @@ def read_index_node(self, node, start=None, stop=None): data, kind, encoding=self.encoding, errors=self.errors ), dtype=object, - **kwargs + **kwargs, ) else: index = factory( _unconvert_index( data, kind, encoding=self.encoding, errors=self.errors ), - **kwargs + **kwargs, ) index.name = name return name, index - def write_array_empty(self, key, value): + def write_array_empty(self, key: str, value): """ write a 0-len array """ # ugly hack for length 0 axes @@ -2931,11 +2938,11 @@ def write_array_empty(self, key, value): getattr(self.group, key)._v_attrs.value_type = str(value.dtype) getattr(self.group, key)._v_attrs.shape = value.shape - def _is_empty_array(self, shape): + def _is_empty_array(self, shape) -> bool: """Returns true if any axis is zero length.""" return any(x == 0 for x in shape) - def write_array(self, key, value, items=None): + def write_array(self, key: str, value, items=None): if key in self.group: self._handle.remove_node(self.group, key) @@ -3020,33 +3027,6 @@ def write_array(self, key, value, items=None): getattr(self.group, key)._v_attrs.transposed = transposed -class LegacyFixed(GenericFixed): - def read_index_legacy(self, key, start=None, stop=None): - node = getattr(self.group, key) - data = node[start:stop] - kind = node._v_attrs.kind - return _unconvert_index_legacy( - data, kind, encoding=self.encoding, errors=self.errors - ) - - -class LegacySeriesFixed(LegacyFixed): - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index_legacy("index") - values = self.read_array("values") - return Series(values, index=index) - - -class LegacyFrameFixed(LegacyFixed): - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index_legacy("index") - columns = self.read_index_legacy("columns") - values = self.read_array("values") - return DataFrame(values, index=index, columns=columns) - - class SeriesFixed(GenericFixed): pandas_kind = "series" attributes = ["name"] @@ -3054,7 +3034,7 @@ class SeriesFixed(GenericFixed): @property def shape(self): try: - return (len(getattr(self.group, "values")),) + return (len(self.group.values),) except (TypeError, AttributeError): return None @@ -3083,13 +3063,13 @@ def shape(self): # items items = 0 for i in range(self.nblocks): - node = getattr(self.group, "block{idx}_items".format(idx=i)) + node = getattr(self.group, f"block{i}_items") shape = getattr(node, "shape", None) if shape is not None: items += shape[0] # data shape - node = getattr(self.group, "block0_values") + node = self.group.block0_values shape = getattr(node, "shape", None) if shape is not None: shape = list(shape[0 : (ndim - 1)]) @@ -3116,17 +3096,15 @@ def read(self, start=None, stop=None, **kwargs): for i in range(self.ndim): _start, _stop = (start, stop) if i == select_axis else (None, None) - ax = self.read_index("axis{idx}".format(idx=i), start=_start, stop=_stop) + ax = self.read_index(f"axis{i}", start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in range(self.nblocks): - blk_items = self.read_index("block{idx}_items".format(idx=i)) - values = self.read_array( 
- "block{idx}_values".format(idx=i), start=_start, stop=_stop - ) + blk_items = self.read_index(f"block{i}_items") + values = self.read_array(f"block{i}_values", start=_start, stop=_stop) blk = make_block( values, placement=items.get_indexer(blk_items), ndim=len(axes) ) @@ -3145,17 +3123,15 @@ def write(self, obj, **kwargs): if i == 0: if not ax.is_unique: raise ValueError("Columns index has to be unique for fixed format") - self.write_index("axis{idx}".format(idx=i), ax) + self.write_index(f"axis{i}", ax) # Supporting mixed-type DataFrame objects...nontrivial self.attrs.nblocks = len(data.blocks) for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 blk_items = data.items.take(blk.mgr_locs) - self.write_array( - "block{idx}_values".format(idx=i), blk.values, items=blk_items - ) - self.write_index("block{idx}_items".format(idx=i), blk_items) + self.write_array(f"block{i}_values", blk.values, items=blk_items) + self.write_index(f"block{i}_items", blk_items) class FrameFixed(BlockManagerFixed): @@ -3189,7 +3165,7 @@ class Table(Fixed): """ pandas_kind = "wide_table" - table_type = None # type: str + table_type: str levels = 1 is_table = True is_shape_reversed = False @@ -3206,35 +3182,29 @@ def __init__(self, *args, **kwargs): self.selection = None @property - def table_type_short(self): + def table_type_short(self) -> str: return self.table_type.split("_")[0] @property - def format_type(self): + def format_type(self) -> str: return "table" def __repr__(self) -> str: """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->[{columns}]".format( - columns=(",".join(self.data_columns) if len(self.data_columns) else "") - ) + jdc = ",".join(self.data_columns) if len(self.data_columns) else "" + dc = f",dc->[{jdc}]" ver = "" if self.is_old_version: - ver = "[{version}]".format(version=".".join(str(x) for x in self.version)) + jver = ".".join(str(x) for x in self.version) + ver = f"[{jver}]" + jindex_axes = ",".join(a.name for a in self.index_axes) return ( - "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows}," - "ncols->{ncols},indexers->[{index_axes}]{dc})".format( - pandas_type=self.pandas_type, - ver=ver, - table_type=self.table_type_short, - nrows=self.nrows, - ncols=self.ncols, - index_axes=(",".join(a.name for a in self.index_axes)), - dc=dc, - ) + f"{self.pandas_type:12.12}{ver} " + f"(typ->{self.table_type_short},nrows->{self.nrows}," + f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" ) def __getitem__(self, c): @@ -3252,9 +3222,7 @@ def validate(self, other): if other.table_type != self.table_type: raise TypeError( "incompatible table_type with existing " - "[{other} - {self}]".format( - other=other.table_type, self=self.table_type - ) + f"[{other.table_type} - {self.table_type}]" ) for c in ["index_axes", "non_index_axes", "values_axes"]: @@ -3267,20 +3235,18 @@ def validate(self, other): oax = ov[i] if sax != oax: raise ValueError( - "invalid combinate of [{c}] on appending data " - "[{sax}] vs current table [{oax}]".format( - c=c, sax=sax, oax=oax - ) + f"invalid combinate of [{c}] on appending data " + f"[{sax}] vs current table [{oax}]" ) # should never get here raise Exception( - "invalid combinate of [{c}] on appending data [{sv}] vs " - "current table [{ov}]".format(c=c, sv=sv, ov=ov) + f"invalid combinate of [{c}] on appending data [{sv}] vs " + f"current table [{ov}]" ) @property - def is_multi_index(self): + def is_multi_index(self) -> bool: """the levels attribute is 1 or a list in the case 
of a multi-index""" return isinstance(self.levels, list) @@ -3293,8 +3259,7 @@ def validate_multiindex(self, obj): new object """ levels = [ - l if l is not None else "level_{0}".format(i) - for i, l in enumerate(obj.index.names) + l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) ] try: return obj.reset_index(), levels @@ -3304,12 +3269,12 @@ def validate_multiindex(self, obj): ) @property - def nrows_expected(self): + def nrows_expected(self) -> int: """ based on our axes, compute the expected nrows """ return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property - def is_exists(self): + def is_exists(self) -> bool: """ has this table been created """ return "table" in self.group @@ -3335,12 +3300,12 @@ def axes(self): return itertools.chain(self.index_axes, self.values_axes) @property - def ncols(self): + def ncols(self) -> int: """ the number of total columns in the values axes """ return sum(len(a.values) for a in self.values_axes) @property - def is_transposed(self): + def is_transposed(self) -> bool: return False @property @@ -3353,7 +3318,7 @@ def data_orientation(self): ) ) - def queryables(self): + def queryables(self) -> Dict[str, Any]: """ return a dict of the kinds allowable columns for this object """ # compute the values_axes queryables @@ -3372,25 +3337,26 @@ def queryables(self): def index_cols(self): """ return a list of my index cols """ + # Note: each `i.cname` below is assured to be a str. return [(i.axis, i.cname) for i in self.index_axes] - def values_cols(self): + def values_cols(self) -> List[str]: """ return a list of my values cols """ return [i.cname for i in self.values_axes] - def _get_metadata_path(self, key): + def _get_metadata_path(self, key) -> str: """ return the metadata pathname for this key """ - return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) + group = self.group._v_pathname + return f"{group}/meta/{key}/meta" - def write_metadata(self, key, values): + def write_metadata(self, key: str, values): """ write out a meta data array to the key as a fixed-format Series Parameters ---------- - key : string + key : str values : ndarray - """ values = Series(values) self.parent.put( @@ -3402,16 +3368,12 @@ def write_metadata(self, key, values): nan_rep=self.nan_rep, ) - def read_metadata(self, key): + def read_metadata(self, key: str): """ return the meta data array for this key """ if getattr(getattr(self.group, "meta", None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) return None - def set_info(self): - """ update our table index info """ - self.attrs.info = self.info - def set_attrs(self): """ set our table type & indexables """ self.attrs.table_type = str(self.table_type) @@ -3424,7 +3386,7 @@ def set_attrs(self): self.attrs.errors = self.errors self.attrs.levels = self.levels self.attrs.metadata = self.metadata - self.set_info() + self.attrs.info = self.info def get_attrs(self): """ retrieve our attributes """ @@ -3465,8 +3427,8 @@ def validate_min_itemsize(self, min_itemsize): continue if k not in q: raise ValueError( - "min_itemsize has the key [{key}] which is not an axis or " - "data_column".format(key=k) + f"min_itemsize has the key [{k}] which is not an axis or " + "data_column" ) @property @@ -3476,6 +3438,8 @@ def indexables(self): self._indexables = [] + # Note: each of the `name` kwargs below are str, ensured + # by the definition in index_cols. 
# index columns self._indexables.extend( [ @@ -3489,6 +3453,7 @@ def indexables(self): base_pos = len(self._indexables) def f(i, c): + assert isinstance(c, str) klass = DataCol if c in dc: klass = DataIndexableCol @@ -3496,6 +3461,8 @@ def f(i, c): i=i, name=c, pos=base_pos + i, version=self.version ) + # Note: the definition of `values_cols` ensures that each + # `c` below is a str. self._indexables.extend( [f(i, c) for i, c in enumerate(self.attrs.values_cols)] ) @@ -3572,9 +3539,19 @@ def create_index(self, columns=None, optlevel=None, kind=None): ) v.create_index(**kw) - def read_axes(self, where, **kwargs): - """create and return the axes sniffed from the table: return boolean - for success + def read_axes(self, where, **kwargs) -> bool: + """ + Create the axes sniffed from the table. + + Parameters + ---------- + where : ??? + **kwargs + + Returns + ------- + bool + Indicates success. """ # validate the version @@ -3620,8 +3597,8 @@ def validate_data_columns(self, data_columns, min_itemsize): info = self.info.get(axis, dict()) if info.get("type") == "MultiIndex" and data_columns: raise ValueError( - "cannot use a multi-index on axis [{0}] with " - "data_columns {1}".format(axis, data_columns) + f"cannot use a multi-index on axis [{axis}] with " + f"data_columns {data_columns}" ) # evaluate the passed data_columns, True == use all columns @@ -3650,11 +3627,11 @@ def create_axes( self, axes, obj, - validate=True, + validate: bool = True, nan_rep=None, data_columns=None, min_itemsize=None, - **kwargs + **kwargs, ): """ create and return the axes legacy tables create an indexable column, indexable index, @@ -3680,9 +3657,10 @@ def create_axes( try: axes = _AXES_MAP[type(obj)] except KeyError: + group = self.group._v_name raise TypeError( - "cannot properly create the storer for: [group->{group}," - "value->{value}]".format(group=self.group._v_name, value=type(obj)) + f"cannot properly create the storer for: [group->{group}," + f"value->{type(obj)}]" ) # map axes to numbers @@ -3723,11 +3701,12 @@ def create_axes( if i in axes: name = obj._AXIS_NAMES[i] - index_axes_map[i] = ( - _convert_index(a, self.encoding, self.errors, self.format_type) - .set_name(name) - .set_axis(i) + new_index = _convert_index( + name, a, self.encoding, self.errors, self.format_type ) + new_index.axis = i + index_axes_map[i] = new_index + else: # we might be able to change the axes on the appending data if @@ -3754,10 +3733,12 @@ def create_axes( self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [ - index_axes_map[a].set_pos(j).update_info(self.info) - for j, a in enumerate(axes) - ] + new_index_axes = [index_axes_map[a] for a in axes] + for j, iax in enumerate(new_index_axes): + iax.set_pos(j) + iax.update_info(self.info) + self.index_axes = new_index_axes + j = len(self.index_axes) # check for column conflicts @@ -3805,11 +3786,10 @@ def get_blk_items(mgr, blocks): new_blocks.append(b) new_blk_items.append(b_items) except (IndexError, KeyError): + jitems = ",".join(pprint_thing(item) for item in items) raise ValueError( - "cannot match existing table structure for [{items}] " - "on appending data".format( - items=(",".join(pprint_thing(item) for item in items)) - ) + f"cannot match existing table structure for [{jitems}] " + "on appending data" ) blocks = new_blocks blk_items = new_blk_items @@ -3826,6 +3806,9 @@ def get_blk_items(mgr, blocks): if data_columns and len(b_items) == 1 and b_items[0] in data_columns: klass = DataIndexableCol name = b_items[0] 
+ if not (name is None or isinstance(name, str)): + # TODO: should the message here be more specifically non-str? + raise ValueError("cannot have non-object label DataIndexableCol") self.data_columns.append(name) # make sure that we match up the existing columns @@ -3835,38 +3818,27 @@ def get_blk_items(mgr, blocks): existing_col = existing_table.values_axes[i] except (IndexError, KeyError): raise ValueError( - "Incompatible appended table [{blocks}]" - "with existing table [{table}]".format( - blocks=blocks, table=existing_table.values_axes - ) + f"Incompatible appended table [{blocks}]" + f"with existing table [{existing_table.values_axes}]" ) else: existing_col = None - try: - col = klass.create_for_block(i=i, name=name, version=self.version) - col.set_atom( - block=b, - block_items=b_items, - existing_col=existing_col, - min_itemsize=min_itemsize, - nan_rep=nan_rep, - encoding=self.encoding, - errors=self.errors, - info=self.info, - ) - col.set_pos(j) + col = klass.create_for_block(i=i, name=name, version=self.version) + col.set_atom( + block=b, + block_items=b_items, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + errors=self.errors, + info=self.info, + ) + col.set_pos(j) + + self.values_axes.append(col) - self.values_axes.append(col) - except (NotImplementedError, ValueError, TypeError) as e: - raise e - except Exception as detail: - raise Exception( - "cannot find the correct atom type -> " - "[dtype->{name},items->{items}] {detail!s}".format( - name=b.dtype.name, items=b_items, detail=detail - ) - ) j += 1 # validate our min_itemsize @@ -3931,17 +3903,18 @@ def process_filter(field, filt): takers = op(values, filt) return obj.loc(axis=axis_number)[takers] - raise ValueError( - "cannot find the field [{field}] for " - "filtering!".format(field=field) - ) + raise ValueError(f"cannot find the field [{field}] for filtering!") obj = process_filter(field, filt) return obj def create_description( - self, complib=None, complevel=None, fletcher32=False, expectedrows=None + self, + complib=None, + complevel=None, + fletcher32: bool = False, + expectedrows: Optional[int] = None, ): """ create the description of the table from the axes & values """ @@ -3968,7 +3941,13 @@ def create_description( return d - def read_coordinates(self, where=None, start=None, stop=None, **kwargs): + def read_coordinates( + self, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + **kwargs, + ): """select coordinates (row numbers) from a table; return the coordinates object """ @@ -3981,7 +3960,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where=where, start=start, stop=stop) coords = self.selection.select_coords() if self.selection.filter is not None: for field, op, filt in self.selection.filter.format(): @@ -3992,7 +3971,13 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return Index(coords) - def read_column(self, column, where=None, start=None, stop=None): + def read_column( + self, + column: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """return a single column from the table, generally only indexables are interesting """ @@ -4013,28 +3998,22 @@ def read_column(self, column, where=None, start=None, stop=None): if not a.is_data_indexable: raise ValueError( - "column [{column}] can 
not be extracted individually; " - "it is not data indexable".format(column=column) + f"column [{column}] can not be extracted individually; " + "it is not data indexable" ) # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series( - _set_tz( - a.convert( - c[start:stop], - nan_rep=self.nan_rep, - encoding=self.encoding, - errors=self.errors, - ).take_data(), - a.tz, - True, - ), - name=column, + a.convert( + c[start:stop], + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, ) + return Series(_set_tz(a.take_data(), a.tz, True), name=column) - raise KeyError("column [{column}] not found in the table".format(column=column)) + raise KeyError(f"column [{column}] not found in the table") class WORMTable(Table): @@ -4058,35 +4037,7 @@ def write(self, **kwargs): raise NotImplementedError("WORKTable needs to implement write") -class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a - (possibly) already existing appendable table this table ALLOWS - append (but doesn't require them), and stores the data in a format - that can be easily searched - - """ - - _indexables = [ - IndexCol(name="index", axis=1, pos=0), - IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"), - DataCol(name="fields", cname="values", kind_attr="fields", pos=2), - ] # type: Optional[List[IndexCol]] - table_type = "legacy" - ndim = 3 - - def write(self, **kwargs): - raise TypeError("write operations are not allowed on legacy tables!") - - def read(self, where=None, columns=None, **kwargs): - """we have n indexable columns, with an arbitrary number of data - axes - """ - - if not self.read_axes(where=where, **kwargs): - return None - - -class AppendableTable(LegacyTable): +class AppendableTable(Table): """ support the new appendable table formats """ _indexables = None @@ -4104,7 +4055,7 @@ def write( chunksize=None, expectedrows=None, dropna=False, - **kwargs + **kwargs, ): if not append and self.is_exists: @@ -4138,7 +4089,7 @@ def write( # table = self.table # update my info - self.set_info() + self.attrs.info = self.info # validate the axes and set the kinds for a in self.axes: @@ -4147,7 +4098,7 @@ def write( # add the rows self.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize, dropna=False): + def write_data(self, chunksize: Optional[int], dropna: bool = False): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ @@ -4259,18 +4210,22 @@ def write_data_chunk(self, rows, indexes, mask, values): rows = rows[m] except Exception as detail: - raise Exception("cannot create row-data -> {detail}".format(detail=detail)) + raise Exception(f"cannot create row-data -> {detail}") try: if len(rows): self.table.append(rows) self.table.flush() except Exception as detail: - raise TypeError( - "tables cannot write this data -> {detail}".format(detail=detail) - ) + raise TypeError(f"tables cannot write this data -> {detail}") - def delete(self, where=None, start=None, stop=None, **kwargs): + def delete( + self, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + **kwargs, + ): # delete all rows (and return the nrows) if where is None or not len(where): @@ -4291,7 +4246,7 @@ def delete(self, where=None, start=None, stop=None, **kwargs): # create the selection table = self.table - self.selection = Selection(self, where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where, start=start, stop=stop) values = 
self.selection.select_coords() # delete the rows in reverse order @@ -4337,10 +4292,10 @@ class AppendableFrameTable(AppendableTable): pandas_kind = "frame_table" table_type = "appendable_frame" ndim = 2 - obj_type = DataFrame # type: Type[Union[DataFrame, Series]] + obj_type: Type[Union[DataFrame, Series]] = DataFrame @property - def is_transposed(self): + def is_transposed(self) -> bool: return self.index_axes[0].axis == 1 def get_object(self, obj): @@ -4411,7 +4366,7 @@ class AppendableSeriesTable(AppendableFrameTable): storage_obj_type = DataFrame @property - def is_transposed(self): + def is_transposed(self) -> bool: return False def get_object(self, obj): @@ -4469,7 +4424,7 @@ class GenericTable(AppendableFrameTable): obj_type = DataFrame @property - def pandas_type(self): + def pandas_type(self) -> str: return self.pandas_kind @property @@ -4499,6 +4454,7 @@ def indexables(self): self._indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): + assert isinstance(n, str) dc = GenericDataIndexableCol( name=n, pos=i, values=[n], version=self.version @@ -4520,7 +4476,7 @@ class AppendableMultiFrameTable(AppendableFrameTable): _re_levels = re.compile(r"^level_\d+$") @property - def table_type_short(self): + def table_type_short(self) -> str: return "appendable_multi" def write(self, obj, data_columns=None, **kwargs): @@ -4547,7 +4503,7 @@ def read(self, **kwargs): return df -def _reindex_axis(obj, axis, labels, other=None): +def _reindex_axis(obj, axis: int, labels: Index, other=None): ax = obj._get_axis(axis) labels = ensure_index(labels) @@ -4562,7 +4518,7 @@ def _reindex_axis(obj, axis, labels, other=None): if other is not None: labels = ensure_index(other.unique()).intersection(labels, sort=False) if not labels.equals(ax): - slicer = [slice(None, None)] * obj.ndim + slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim slicer[axis] = labels obj = obj.loc[tuple(slicer)] return obj @@ -4588,7 +4544,7 @@ def _get_tz(tz): return zone -def _set_tz(values, tz, preserve_UTC=False, coerce=False): +def _set_tz(values, tz, preserve_UTC: bool = False, coerce: bool = False): """ coerce the values to a DatetimeIndex if tz is set preserve the input shape if possible @@ -4597,7 +4553,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): ---------- values : ndarray tz : string/pickled tz object - preserve_UTC : boolean, + preserve_UTC : bool, preserve the UTC of the result coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ @@ -4617,32 +4573,37 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): return values -def _convert_index(index, encoding=None, errors="strict", format_type=None): +def _convert_index(name: str, index, encoding=None, errors="strict", format_type=None): + assert isinstance(name, str) + index_name = getattr(index, "name", None) if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol( + name, converted, "datetime64", _tables().Int64Col(), - freq=getattr(index, "freq", None), - tz=getattr(index, "tz", None), + freq=index.freq, + tz=index.tz, index_name=index_name, ) elif isinstance(index, TimedeltaIndex): converted = index.asi8 return IndexCol( + name, converted, "timedelta64", _tables().Int64Col(), - freq=getattr(index, "freq", None), + freq=index.freq, index_name=index_name, ) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects return IndexCol( + name, index._ndarray_values, "integer", atom, @@ -4660,6 
+4621,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): if inferred_type == "datetime64": converted = values.view("i8") return IndexCol( + name, converted, "datetime64", _tables().Int64Col(), @@ -4670,6 +4632,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): elif inferred_type == "timedelta64": converted = values.view("i8") return IndexCol( + name, converted, "timedelta64", _tables().Int64Col(), @@ -4682,11 +4645,13 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): dtype=np.float64, ) return IndexCol( - converted, "datetime", _tables().Time64Col(), index_name=index_name + name, converted, "datetime", _tables().Time64Col(), index_name=index_name ) elif inferred_type == "date": converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) - return IndexCol(converted, "date", _tables().Time32Col(), index_name=index_name) + return IndexCol( + name, converted, "date", _tables().Time32Col(), index_name=index_name, + ) elif inferred_type == "string": # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', atom @@ -4694,6 +4659,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize return IndexCol( + name, converted, "string", _tables().StringCol(itemsize), @@ -4704,29 +4670,39 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): if format_type == "fixed": atom = _tables().ObjectAtom() return IndexCol( - np.asarray(values, dtype="O"), "object", atom, index_name=index_name + name, + np.asarray(values, dtype="O"), + "object", + atom, + index_name=index_name, ) raise TypeError( - "[unicode] is not supported as a in index type for [{0}] formats".format( - format_type - ) + f"[unicode] is not supported as a in index type for [{format_type}] formats" ) elif inferred_type == "integer": # take a guess for now, hope the values fit atom = _tables().Int64Col() return IndexCol( - np.asarray(values, dtype=np.int64), "integer", atom, index_name=index_name + name, + np.asarray(values, dtype=np.int64), + "integer", + atom, + index_name=index_name, ) elif inferred_type == "floating": atom = _tables().Float64Col() return IndexCol( - np.asarray(values, dtype=np.float64), "float", atom, index_name=index_name + name, + np.asarray(values, dtype=np.float64), + "float", + atom, + index_name=index_name, ) else: # pragma: no cover atom = _tables().ObjectAtom() return IndexCol( - np.asarray(values, dtype="O"), "object", atom, index_name=index_name + name, np.asarray(values, dtype="O"), "object", atom, index_name=index_name, ) @@ -4752,22 +4728,7 @@ def _unconvert_index(data, kind, encoding=None, errors="strict"): elif kind == "object": index = np.asarray(data[0]) else: # pragma: no cover - raise ValueError("unrecognized index type {kind}".format(kind=kind)) - return index - - -def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, errors="strict"): - kind = _ensure_decoded(kind) - if kind == "datetime": - index = to_datetime(data) - elif kind in ("integer"): - index = np.asarray(data, dtype=object) - elif kind in ("string"): - index = _unconvert_string_array( - data, nan_rep=None, encoding=encoding, errors=errors - ) - else: # pragma: no cover - raise ValueError("unrecognized index type {kind}".format(kind=kind)) + raise ValueError(f"unrecognized index type {kind}") return index @@ -4799,7 +4760,7 @@ def _convert_string_array(data, 
encoding, errors, itemsize=None): ensured = ensure_object(data.ravel()) itemsize = max(1, libwriters.max_len_string_array(ensured)) - data = np.asarray(data, dtype="S{size}".format(size=itemsize)) + data = np.asarray(data, dtype=f"S{itemsize}") return data @@ -4828,7 +4789,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): if encoding is not None and len(data): itemsize = libwriters.max_len_string_array(ensure_object(data)) - dtype = "U{0}".format(itemsize) + dtype = f"U{itemsize}" if isinstance(data[0], bytes): data = Series(data).str.decode(encoding, errors=errors).values @@ -4842,7 +4803,8 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): return data.reshape(shape) -def _maybe_convert(values, val_kind, encoding, errors): +def _maybe_convert(values: np.ndarray, val_kind, encoding, errors): + val_kind = _ensure_decoded(val_kind) if _need_convert(val_kind): conv = _get_converter(val_kind, encoding, errors) # conv = np.frompyfunc(conv, 1, 1) @@ -4850,8 +4812,7 @@ def _maybe_convert(values, val_kind, encoding, errors): return values -def _get_converter(kind, encoding, errors): - kind = _ensure_decoded(kind) +def _get_converter(kind: str, encoding, errors): if kind == "datetime64": return lambda x: np.asarray(x, dtype="M8[ns]") elif kind == "datetime": @@ -4859,11 +4820,10 @@ def _get_converter(kind, encoding, errors): elif kind == "string": return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors) else: # pragma: no cover - raise ValueError("invalid kind {kind}".format(kind=kind)) + raise ValueError(f"invalid kind {kind}") -def _need_convert(kind): - kind = _ensure_decoded(kind) +def _need_convert(kind) -> bool: if kind in ("datetime", "datetime64", "string"): return True return False @@ -4881,7 +4841,13 @@ class Selection: """ - def __init__(self, table, where=None, start=None, stop=None): + def __init__( + self, + table: Table, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): self.table = table self.where = where self.start = start @@ -4932,20 +4898,19 @@ def generate(self, where): q = self.table.queryables() try: - return Expr(where, queryables=q, encoding=self.table.encoding) + return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) except NameError: # raise a nice message, suggesting that the user should use # data_columns + qkeys = ",".join(q.keys()) raise ValueError( - "The passed where expression: {0}\n" + f"The passed where expression: {where}\n" " contains an invalid variable reference\n" " all of the variable references must be a " "reference to\n" " an axis (e.g. 'index' or 'columns'), or a " "data_column\n" - " The currently defined references are: {1}\n".format( - where, ",".join(q.keys()) - ) + f" The currently defined references are: {qkeys}\n" ) def select(self): diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index ea26a9b8efdbf..9aa8ed1dfeb5d 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -143,7 +143,7 @@ """ -def _parse_date(datestr): +def _parse_date(datestr: str) -> datetime: """ Given a date in xport format, return Python date. """ try: # e.g. "16FEB11:10:07:55" @@ -152,11 +152,11 @@ def _parse_date(datestr): return pd.NaT -def _split_line(s, parts): +def _split_line(s: str, parts): """ Parameters ---------- - s: string + s: str Fixed-length string to split parts: list of (name, length) pairs Used to break up string, name '_' will be filtered from output. 
@@ -402,7 +402,7 @@ def _read_header(self): def __next__(self): return self.read(nrows=self._chunksize or 1) - def _record_count(self): + def _record_count(self) -> int: """ Get number of records in file. @@ -482,7 +482,7 @@ def read(self, nrows=None): df = pd.DataFrame(index=range(read_lines)) for j, x in enumerate(self.columns): - vec = data["s%d" % j] + vec = data["s" + str(j)] ntype = self.fields[j]["ntype"] if ntype == "numeric": vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e90e19649f645..684e602f06d12 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -361,7 +361,9 @@ def read_sql( or DBAPI2 connection (fallback mode) Using SQLAlchemy makes it possible to use any DB supported by that - library. If a DBAPI2 object, only sqlite3 is supported. + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy connectable. See + `here `_ index_col : string or list of strings, optional, default: None Column(s) to set as index(MultiIndex). coerce_float : boolean, default True diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d51c9170c488b..567eeb7f5cdc8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -58,10 +58,6 @@ convert_categoricals : bool, default True Read value labels and convert columns to Categorical/Factor variables.""" -_encoding_params = """\ -encoding : str, None or encoding - Encoding used to parse the files. None defaults to latin-1.""" - _statafile_processing_params2 = """\ index_col : str, optional Column to set as index. @@ -108,7 +104,6 @@ %s %s %s -%s Returns ------- @@ -132,7 +127,6 @@ ... do_something(chunk) """ % ( _statafile_processing_params1, - _encoding_params, _statafile_processing_params2, _chunksize_params, _iterator_params, @@ -189,23 +183,19 @@ %s %s %s -%s """ % ( _statafile_processing_params1, _statafile_processing_params2, - _encoding_params, _chunksize_params, ) @Appender(_read_stata_doc) -@deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def read_stata( filepath_or_buffer, convert_dates=True, convert_categoricals=True, - encoding=None, index_col=None, convert_missing=False, preserve_dtypes=True, @@ -614,7 +604,7 @@ def _cast_to_stata_types(data): data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53: + if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): ws = precision_loss_doc % ("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() @@ -1044,7 +1034,6 @@ def __init__(self): class StataReader(StataParser, BaseIterator): __doc__ = _stata_reader_doc - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def __init__( self, @@ -1056,7 +1045,6 @@ def __init__( preserve_dtypes=True, columns=None, order_categoricals=True, - encoding=None, chunksize=None, ): super().__init__() @@ -2134,14 +2122,12 @@ class StataWriter(StataParser): _max_string_length = 244 - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def __init__( self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -2640,7 +2626,7 @@ def _dtype_to_stata_type_117(dtype, column, force_strl): elif dtype == np.int8: return 65530 else: # pragma : no cover - 
raise NotImplementedError("Data type %s not supported." % dtype) + raise NotImplementedError(f"Data type {dtype} not supported.") def _pad_bytes_new(name, length): @@ -2859,8 +2845,6 @@ class StataWriter117(StataWriter): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime @@ -2912,14 +2896,12 @@ class StataWriter117(StataWriter): _max_string_length = 2045 - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def __init__( self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index ebe047c58b889..55c861e384d67 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -38,7 +38,6 @@ - hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`) - boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`) - boxplot_frame and boxplot_frame_groupby -- tsplot (deprecated) - register and deregister (register converters for the tick formats) - Plots not called as `Series` and `DataFrame` methods: - table diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 6fc5b03920cba..beb276478070e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -23,7 +23,7 @@ def hist_series( figsize=None, bins=10, backend=None, - **kwargs + **kwargs, ): """ Draw histogram of the input series using matplotlib. @@ -83,7 +83,7 @@ def hist_series( yrot=yrot, figsize=figsize, bins=bins, - **kwargs + **kwargs, ) @@ -103,7 +103,7 @@ def hist_frame( layout=None, bins=10, backend=None, - **kwargs + **kwargs, ): """ Make a histogram of the DataFrame's. @@ -206,7 +206,7 @@ def hist_frame( figsize=figsize, layout=layout, bins=bins, - **kwargs + **kwargs, ) @@ -400,7 +400,7 @@ def boxplot( figsize=None, layout=None, return_type=None, - **kwargs + **kwargs, ): plot_backend = _get_plot_backend("matplotlib") return plot_backend.boxplot( @@ -414,7 +414,7 @@ def boxplot( figsize=figsize, layout=layout, return_type=return_type, - **kwargs + **kwargs, ) @@ -432,7 +432,7 @@ def boxplot_frame( layout=None, return_type=None, backend=None, - **kwargs + **kwargs, ): plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame( @@ -446,7 +446,7 @@ def boxplot_frame( figsize=figsize, layout=layout, return_type=return_type, - **kwargs + **kwargs, ) @@ -463,7 +463,7 @@ def boxplot_frame_groupby( sharex=False, sharey=True, backend=None, - **kwargs + **kwargs, ): """ Make box plots from DataFrameGroupBy data. @@ -536,7 +536,7 @@ def boxplot_frame_groupby( layout=layout, sharex=sharex, sharey=sharey, - **kwargs + **kwargs, ) @@ -736,26 +736,23 @@ def _get_call_args(backend_name, data, args, kwargs): ] else: raise TypeError( - ( - "Called plot accessor for type {}, expected Series or DataFrame" - ).format(type(data).__name__) + f"Called plot accessor for type {type(data).__name__}, " + "expected Series or DataFrame" ) if args and isinstance(data, ABCSeries): + positional_args = str(args)[1:-1] + keyword_args = ", ".join( + f"{name}={value!r}" for (name, default), value in zip(arg_def, args) + ) msg = ( "`Series.plot()` should not be called with positional " "arguments, only keyword arguments. The order of " "positional arguments will change in the future. " - "Use `Series.plot({})` instead of `Series.plot({})`." 
- ) - positional_args = str(args)[1:-1] - keyword_args = ", ".join( - "{}={!r}".format(name, value) - for (name, default), value in zip(arg_def, args) - ) - warnings.warn( - msg.format(keyword_args, positional_args), FutureWarning, stacklevel=3 + f"Use `Series.plot({keyword_args})` instead of " + f"`Series.plot({positional_args})`." ) + warnings.warn(msg, FutureWarning, stacklevel=3) pos_args = {name: value for value, (name, _) in zip(args, arg_def)} if backend_name == "pandas.plotting._matplotlib": @@ -776,8 +773,13 @@ def __call__(self, *args, **kwargs): ) kind = self._kind_aliases.get(kind, kind) + + # when using another backend, get out of the way + if plot_backend.__name__ != "pandas.plotting._matplotlib": + return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) + if kind not in self._all_kinds: - raise ValueError("{} is not a valid plot kind".format(kind)) + raise ValueError(f"{kind} is not a valid plot kind") # The original data structured can be transformed before passed to the # backend. For example, for DataFrame is common to set the index as the @@ -791,14 +793,13 @@ def __call__(self, *args, **kwargs): if isinstance(data, ABCDataFrame): return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) else: - raise ValueError( - ("plot kind {} can only be used for data frames").format(kind) - ) + raise ValueError(f"plot kind {kind} can only be used for data frames") elif kind in self._series_kinds: if isinstance(data, ABCDataFrame): if y is None and kwargs.get("subplots") is False: - msg = "{} requires either y column or 'subplots=True'" - raise ValueError(msg.format(kind)) + raise ValueError( + f"{kind} requires either y column or 'subplots=True'" + ) elif y is not None: if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] @@ -1634,12 +1635,11 @@ def _find_backend(backend: str): _backends[backend] = module return module - msg = ( - "Could not find plotting backend '{name}'. Ensure that you've installed the " - "package providing the '{name}' entrypoint, or that the package has a" + raise ValueError( + f"Could not find plotting backend '{backend}'. Ensure that you've installed " + f"the package providing the '{backend}' entrypoint, or that the package has a " "top-level `.plot` method." ) - raise ValueError(msg.format(name=backend)) def _get_plot_backend(backend=None): diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index cfd6c3519d82c..7bcca659ee3f6 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -74,9 +74,8 @@ def _validate_color_args(self): for key, values in self.color.items(): if key not in valid_keys: raise ValueError( - "color dict contains invalid " - "key '{0}' " - "The key must be either {1}".format(key, valid_keys) + f"color dict contains invalid key '{key}'. 
" + f"The key must be either {valid_keys}" ) else: self.color = None @@ -184,7 +183,7 @@ def _grouped_plot_by_column( ax=None, layout=None, return_type=None, - **kwargs + **kwargs, ): grouped = data.groupby(by) if columns is None: @@ -217,7 +216,7 @@ def _grouped_plot_by_column( result = axes byline = by[0] if len(by) == 1 else by - fig.suptitle("Boxplot grouped by {byline}".format(byline=byline)) + fig.suptitle(f"Boxplot grouped by {byline}") fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) return result @@ -234,7 +233,7 @@ def boxplot( figsize=None, layout=None, return_type=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -268,9 +267,8 @@ def _get_colors(): result[key_to_index[key]] = value else: raise ValueError( - "color dict contains invalid " - "key '{0}' " - "The key must be either {1}".format(key, valid_keys) + f"color dict contains invalid key '{key}'. " + f"The key must be either {valid_keys}" ) else: result.fill(colors) @@ -359,7 +357,7 @@ def boxplot_frame( figsize=None, layout=None, return_type=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -374,7 +372,7 @@ def boxplot_frame( figsize=figsize, layout=layout, return_type=return_type, - **kwds + **kwds, ) plt.draw_if_interactive() return ax @@ -392,7 +390,7 @@ def boxplot_frame_groupby( layout=None, sharex=False, sharey=True, - **kwds + **kwds, ): if subplots is True: naxes = len(grouped) @@ -432,6 +430,6 @@ def boxplot_frame_groupby( ax=ax, figsize=figsize, layout=layout, - **kwds + **kwds, ) return ret diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 946ce8bcec97f..feb895a099da5 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -125,7 +125,7 @@ def time2num(d): if isinstance(d, str): parsed = tools.to_datetime(d) if not isinstance(parsed, datetime): - raise ValueError("Could not parse time {d}".format(d=d)) + raise ValueError(f"Could not parse time {d}") return _to_ordinalf(parsed.time()) if isinstance(d, pydt.time): return _to_ordinalf(d) @@ -244,7 +244,7 @@ def get_datevalue(date, freq): return date elif date is None: return None - raise ValueError("Unrecognizable date '{date}'".format(date=date)) + raise ValueError(f"Unrecognizable date '{date}'") def _dt_to_float_ordinal(dt): @@ -421,15 +421,14 @@ def __call__(self): if estimate > self.MAXTICKS * 2: raise RuntimeError( - ( - "MillisecondLocator estimated to generate " - "{estimate:d} ticks from {dmin} to {dmax}: " - "exceeds Locator.MAXTICKS" - "* 2 ({arg:d}) " - ).format(estimate=estimate, dmin=dmin, dmax=dmax, arg=self.MAXTICKS * 2) + "MillisecondLocator estimated to generate " + f"{estimate:d} ticks from {dmin} to {dmax}: " + "exceeds Locator.MAXTICKS" + f"* 2 ({self.MAXTICKS * 2:d}) " ) - freq = "%dL" % self._get_interval() + interval = self._get_interval() + freq = f"{interval}L" tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) @@ -581,7 +580,7 @@ def _daily_finder(vmin, vmax, freq): elif freq == FreqGroup.FR_HR: periodsperday = 24 else: # pragma: no cover - raise ValueError("unexpected frequency: {freq}".format(freq=freq)) + raise ValueError(f"unexpected frequency: {freq}") periodsperyear = 365 * periodsperday periodspermonth = 28 * periodsperday @@ -940,8 +939,7 @@ def get_finder(freq): elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: return _daily_finder else: # pragma: no cover - errmsg = "Unsupported frequency: 
{freq}".format(freq=freq) - raise NotImplementedError(errmsg) + raise NotImplementedError(f"Unsupported frequency: {freq}") class TimeSeries_DateLocator(Locator): @@ -1118,11 +1116,11 @@ def format_timedelta_ticks(x, pos, n_decimals): h, m = divmod(m, 60) d, h = divmod(h, 24) decimals = int(ns * 10 ** (n_decimals - 9)) - s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) + s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}" if n_decimals > 0: - s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) + s += f".{decimals:0{n_decimals}d}" if d != 0: - s = "{:d} days ".format(int(d)) + s + s = f"{int(d):d} days {s}" return s def __call__(self, x, pos=0): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5853367f71d56..f2efed30c48e8 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -57,7 +57,7 @@ def _kind(self): _layout_type = "vertical" _default_rot = 0 - orientation = None # type: Optional[str] + orientation: Optional[str] = None _pop_attributes = [ "label", "style", @@ -102,7 +102,7 @@ def __init__( table=False, layout=None, include_bool=False, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -349,8 +349,7 @@ def _setup_subplots(self): if input_log - valid_log: invalid_log = next(iter((input_log - valid_log))) raise ValueError( - "Boolean, None and 'sym' are valid options," - " '{}' is given.".format(invalid_log) + f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." ) if self.logx is True or self.loglog is True: @@ -501,14 +500,13 @@ def _adorn_subplots(self): if self.subplots: if is_list_like(self.title): if len(self.title) != self.nseries: - msg = ( + raise ValueError( "The length of `title` must equal the number " "of columns if using `title` of type `list` " "and `subplots=True`.\n" - "length of title = {}\n" - "number of columns = {}" - ).format(len(self.title), self.nseries) - raise ValueError(msg) + f"length of title = {len(self.title)}\n" + f"number of columns = {self.nseries}" + ) for (ax, title) in zip(self.axes, self.title): ax.set_title(title) @@ -813,11 +811,10 @@ def match_labels(data, e): or (err_shape[1] != 2) or (err_shape[2] != len(self.data)) ): - msg = ( + raise ValueError( "Asymmetrical error bars should be provided " - + "with the shape (%u, 2, %u)" % (self.nseries, len(self.data)) + f"with the shape ({self.nseries}, 2, {len(self.data)})" ) - raise ValueError(msg) # broadcast errors to each data series if len(err) == 1: @@ -827,7 +824,7 @@ def match_labels(data, e): err = np.tile([err], (self.nseries, len(self.data))) else: - msg = "No valid {label} detected".format(label=label) + msg = f"No valid {label} detected" raise ValueError(msg) return err @@ -985,7 +982,7 @@ def _make_plot(self): c=c_values, label=label, cmap=cmap, - **self.kwds + **self.kwds, ) if cb: cbar_label = c if c_is_column else "" @@ -1095,7 +1092,7 @@ def _make_plot(self): column_num=i, stacking_id=stacking_id, is_errorbar=is_errorbar, - **kwds + **kwds, ) self._add_legend_handle(newlines[0], label, index=i) @@ -1178,7 +1175,7 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): raise ValueError( "When stacked is True, each column must be either " "all positive or negative." 
- "{0} contains both positive and negative values".format(label) + f"{label} contains both positive and negative values" ) @classmethod @@ -1250,7 +1247,7 @@ def _plot( column_num=None, stacking_id=None, is_errorbar=False, - **kwds + **kwds, ): if column_num == 0: @@ -1386,7 +1383,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) ax.set_title(label) elif self.stacked: @@ -1401,7 +1398,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) pos_prior = pos_prior + np.where(mask, y, 0) neg_prior = neg_prior + np.where(mask, 0, y) @@ -1415,7 +1412,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) self._add_legend_handle(rect, label, index=i) @@ -1473,7 +1470,7 @@ class PiePlot(MPLPlot): def __init__(self, data, kind=None, **kwargs): data = data.fillna(value=0) if (data < 0).any().any(): - raise ValueError("{0} doesn't allow negative values".format(kind)) + raise ValueError(f"{kind} doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) def _args_adjust(self): diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index c4ac9ead3f3d3..b60e8fa8a3f7c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -49,7 +49,7 @@ def _plot( bottom=0, column_num=0, stacking_id=None, - **kwds + **kwds, ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) @@ -145,7 +145,7 @@ def _plot( ind=None, column_num=None, stacking_id=None, - **kwds + **kwds, ): from scipy.stats import gaussian_kde @@ -177,7 +177,7 @@ def _grouped_plot( layout=None, rot=0, ax=None, - **kwargs + **kwargs, ): if figsize == "default": @@ -226,7 +226,7 @@ def _grouped_hist( xrot=None, ylabelsize=None, yrot=None, - **kwargs + **kwargs, ): """ Grouped histogram @@ -290,7 +290,7 @@ def hist_series( yrot=None, figsize=None, bins=10, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -335,7 +335,7 @@ def hist_series( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds + **kwds, ) if hasattr(axes, "ndim"): @@ -359,7 +359,7 @@ def hist_frame( figsize=None, layout=None, bins=10, - **kwds + **kwds, ): if by is not None: axes = _grouped_hist( @@ -377,7 +377,7 @@ def hist_frame( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds + **kwds, ) return axes diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 6d5a94c4d5ff8..0720f544203f7 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -22,7 +22,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwds + **kwds, ): df = frame._get_numeric_data() n = df.columns.size @@ -160,7 +160,7 @@ def normalize(series): to_plot[kls][1], color=colors[i], label=pprint_thing(kls), - **kwds + **kwds, ) ax.legend() @@ -315,7 +315,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -395,7 +395,7 @@ def lag_plot(series, lag=1, ax=None, **kwds): if ax is None: ax = plt.gca() ax.set_xlabel("y(t)") - ax.set_ylabel("y(t + {lag})".format(lag=lag)) + ax.set_ylabel(f"y(t + {lag})") ax.scatter(y1, y2, **kwds) return ax diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 927b9cf4e392a..fd69265b18a5b 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -20,7 +20,7 @@ def _get_standard_colors( cmap = colormap 
colormap = cm.get_cmap(colormap) if colormap is None: - raise ValueError("Colormap {0} is not recognized".format(cmap)) + raise ValueError(f"Colormap {cmap} is not recognized") colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] elif color is not None: if colormap is not None: diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 931c699d9b9fd..fa9585e1fc229 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -307,7 +307,8 @@ def _maybe_convert_index(ax, data): def _format_coord(freq, t, y): - return "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y) + time_period = Period(ordinal=int(t), freq=freq) + return f"t = {time_period} y = {y:8f}" def format_dateaxis(subplot, freq, index): diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index caa0167c06389..dd4034a97f58e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -60,10 +60,7 @@ def _get_layout(nplots, layout=None, layout_type="box"): if nrows * ncols < nplots: raise ValueError( - "Layout of {nrows}x{ncols} must be larger " - "than required size {nplots}".format( - nrows=nrows, ncols=ncols, nplots=nplots - ) + f"Layout of {nrows}x{ncols} must be larger than required size {nplots}" ) return layout @@ -101,7 +98,7 @@ def _subplots( ax=None, layout=None, layout_type="box", - **fig_kw + **fig_kw, ): """Create a figure with a set of subplots already made. @@ -203,8 +200,8 @@ def _subplots( return fig, ax else: raise ValueError( - "The number of passed axes must be {0}, the " - "same as the output plot".format(naxes) + f"The number of passed axes must be {naxes}, the " + "same as the output plot" ) fig = ax.get_figure() diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 50b3b405692a5..1087d314b1bf7 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -1,5 +1,4 @@ from contextlib import contextmanager -import warnings from pandas.util._decorators import deprecate_kwarg @@ -82,7 +81,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwargs + **kwargs, ): """ Draw a matrix of scatter plots. @@ -134,7 +133,7 @@ def scatter_matrix( density_kwds=density_kwds, hist_kwds=hist_kwds, range_padding=range_padding, - **kwargs + **kwargs, ) @@ -207,7 +206,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): ax=ax, color=color, colormap=colormap, - **kwds + **kwds, ) @@ -255,7 +254,7 @@ def andrews_curves( samples=samples, color=color, colormap=colormap, - **kwargs + **kwargs, ) @@ -325,7 +324,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwargs + **kwargs, ): """ Parallel coordinates plotting. 
@@ -364,7 +363,7 @@ def parallel_coordinates( -------- >>> from matplotlib import pyplot as plt >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master' - '/pandas/tests/data/iris.csv') + '/pandas/tests/data/csv/iris.csv') >>> pd.plotting.parallel_coordinates( df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) @@ -383,7 +382,7 @@ def parallel_coordinates( axvlines=axvlines, axvlines_kwds=axvlines_kwds, sort_labels=sort_labels, - **kwargs + **kwargs, ) @@ -426,33 +425,6 @@ def autocorrelation_plot(series, ax=None, **kwargs): return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs) -def tsplot(series, plotf, ax=None, **kwargs): - """ - Plots a Series on the given Matplotlib axes or the current axes - - Parameters - ---------- - axes : Axes - series : Series - - Notes - _____ - Supports same kwargs as Axes.plot - - - .. deprecated:: 0.23.0 - Use Series.plot() instead - """ - warnings.warn( - "'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, - stacklevel=2, - ) - plot_backend = _get_plot_backend("matplotlib") - return plot_backend.tsplot(series=series, plotf=plotf, ax=ax, **kwargs) - - class _Options(dict): """ Stores pandas plotting options. @@ -474,9 +446,7 @@ def __init__(self, deprecated=False): def __getitem__(self, key): key = self._get_canonical_key(key) if key not in self: - raise ValueError( - "{key} is not a valid pandas plotting option".format(key=key) - ) + raise ValueError(f"{key} is not a valid pandas plotting option") return super().__getitem__(key) def __setitem__(self, key, value): @@ -486,7 +456,7 @@ def __setitem__(self, key, value): def __delitem__(self, key): key = self._get_canonical_key(key) if key in self._DEFAULT_KEYS: - raise ValueError("Cannot remove default parameter {key}".format(key=key)) + raise ValueError(f"Cannot remove default parameter {key}") return super().__delitem__(key) def __contains__(self, key): diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 3a8e263ac2a6d..1282aa6edd538 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -13,7 +13,7 @@ def check(self, namespace, expected, ignored=None): result = sorted(f for f in dir(namespace) if not f.startswith("__")) if ignored is not None: - result = sorted(list(set(result) - set(ignored))) + result = sorted(set(result) - set(ignored)) expected = sorted(expected) tm.assert_almost_equal(result, expected) @@ -43,7 +43,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules = [] # type: List[str] + deprecated_modules: List[str] = [] # misc misc = ["IndexSlice", "NaT"] @@ -80,6 +80,7 @@ class TestPDApi(Base): "PeriodDtype", "IntervalDtype", "DatetimeTZDtype", + "BooleanDtype", "Int8Dtype", "Int16Dtype", "Int32Dtype", @@ -94,10 +95,10 @@ class TestPDApi(Base): classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) # these are already deprecated; awaiting removal - deprecated_classes = [] # type: List[str] + deprecated_classes: List[str] = [] # these should be deprecated in the future - deprecated_classes_in_future = [] # type: List[str] + deprecated_classes_in_future: List[str] = [] # external modules exposed in pandas namespace modules = ["np", "datetime"] @@ -173,10 +174,10 @@ class TestPDApi(Base): funcs_to = ["to_datetime", "to_msgpack", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future - deprecated_funcs_in_future = [] # type: List[str] + deprecated_funcs_in_future: 
List[str] = [] # these are already deprecated; awaiting removal - deprecated_funcs = [] # type: List[str] + deprecated_funcs: List[str] = [] # private modules in pandas namespace private_modules = [ diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index e9f68692a9863..97480502f192c 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -50,7 +50,7 @@ class TestTypes(Base): "infer_dtype", "is_extension_array_dtype", ] - deprecated = ["is_period", "is_datetimetz", "is_extension_type"] + deprecated = ["is_extension_type"] dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"] def test_types(self): diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 774ff14398bdb..1f8fdfd671856 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -21,7 +21,24 @@ def id_func(x): @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): - # zero-dim integer array behaves like an integer + """ + Several variants of integer value 1. The zero-dim integer array + behaves like an integer. + + This fixture can be used to check that datetimelike indexes handle + addition and subtraction of integers and zero-dimensional arrays + of integers. + + Examples + -------- + >>> dti = pd.date_range('2016-01-01', periods=2, freq='H') + >>> dti + DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], + dtype='datetime64[ns]', freq='H') + >>> dti + one + DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], + dtype='datetime64[ns]', freq='H') + """ return request.param @@ -40,8 +57,21 @@ def one(request): @pytest.fixture(params=zeros) def zero(request): - # For testing division by (or of) zero for Index with length 5, this - # gives several scalar-zeros and length-5 vector-zeros + """ + Several types of scalar zeros and length 5 vectors of zeros. + + This fixture can be used to check that numeric-dtype indexes handle + division by any zero numeric-dtype. + + Uses vector of length 5 for broadcasting with `numeric_idx` fixture, + which creates numeric-dtype vectors also of length 5. + + Examples + -------- + >>> arr = pd.RangeIndex(5) + >>> arr / zeros + Float64Index([nan, inf, inf, inf, inf], dtype='float64') + """ return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index d239687a37757..1ba0930c06334 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -26,7 +26,9 @@ Timestamp, date_range, ) +import pandas.core.arrays.datetimelike as dtl from pandas.core.indexes.datetimes import _to_M8 +from pandas.core.ops import roperator import pandas.util.testing as tm @@ -102,19 +104,24 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - def test_dt64arr_cmp_date_invalid(self, tz_naive_fixture, box_with_array): - # GH#19800, GH#19301 datetime.date comparison raises to - # match DatetimeIndex/Timestamp. 
This also matches the behavior - # of stdlib datetime.datetime - tz = tz_naive_fixture - - dti = pd.date_range("20010101", periods=10, tz=tz) - date = dti[0].to_pydatetime().date() - - dtarr = tm.box_expected(dti, box_with_array) - assert_invalid_comparison(dtarr, date, box_with_array) - - @pytest.mark.parametrize("other", ["foo", -1, 99, 4.0, object(), timedelta(days=2)]) + @pytest.mark.parametrize( + "other", + [ + "foo", + -1, + 99, + 4.0, + object(), + timedelta(days=2), + # GH#19800, GH#19301 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. This also matches the behavior + # of stdlib datetime.datetime + datetime(2001, 1, 1).date(), + # GH#19301 None and NaN are *not* cast to NaT for comparisons + None, + np.nan, + ], + ) def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): # GH#22074, GH#15966 tz = tz_naive_fixture @@ -123,16 +130,6 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra dtarr = tm.box_expected(rng, box_with_array) assert_invalid_comparison(dtarr, other, box_with_array) - @pytest.mark.parametrize("other", [None, np.nan]) - def test_dt64arr_cmp_na_scalar_invalid( - self, other, tz_naive_fixture, box_with_array - ): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - assert_invalid_comparison(dtarr, other, box_with_array) - def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly tz = tz_naive_fixture @@ -258,15 +255,10 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): tm.assert_equal(left >= NaT, expected) tm.assert_equal(NaT <= left, expected) - def test_series_comparison_scalars(self): + @pytest.mark.parametrize("val", [datetime(2000, 1, 4), datetime(2000, 1, 5)]) + def test_series_comparison_scalars(self, val): series = Series(date_range("1/1/2000", periods=10)) - val = datetime(2000, 1, 4) - result = series > val - expected = Series([x > val for x in series]) - tm.assert_series_equal(result, expected) - - val = series[5] result = series > val expected = Series([x > val for x in series]) tm.assert_series_equal(result, expected) @@ -1020,14 +1012,24 @@ def test_dt64arr_add_timestamp_raises(self, box_with_array): # ------------------------------------------------------------- # Other Invalid Addition/Subtraction - @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) - def test_dt64arr_add_sub_float(self, other, box_with_array): - dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + @pytest.mark.parametrize( + "other", + [ + 3.14, + np.array([2.0, 3.0]), + # GH#13078 datetime +/- Period is invalid + pd.Period("2011-01-01", freq="D"), + ], + ) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): + dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) dtarr = tm.box_expected(dti, box_with_array) msg = "|".join( [ "unsupported operand type", "cannot (add|subtract)", + "cannot use operands with types", "ufunc '?(add|subtract)'? 
cannot use operands with types", ] ) @@ -1068,24 +1070,6 @@ def test_dt64arr_add_sub_parr( with pytest.raises(TypeError, match=msg): parr - dtarr - @pytest.mark.parametrize("dti_freq", [None, "D"]) - def test_dt64arr_add_sub_period_scalar(self, dti_freq, box_with_array): - # GH#13078 - # not supported, check TypeError - per = pd.Period("2011-01-01", freq="D") - - idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) - dtarr = tm.box_expected(idx, box_with_array) - msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) - with pytest.raises(TypeError, match=msg): - dtarr + per - with pytest.raises(TypeError, match=msg): - per + dtarr - with pytest.raises(TypeError, match=msg): - dtarr - per - with pytest.raises(TypeError, match=msg): - per - dtarr - class TestDatetime64DateOffsetArithmetic: @@ -1406,7 +1390,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other @@ -1435,7 +1419,7 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): res = dtarr + other expected = DatetimeIndex( [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -1443,11 +1427,11 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): res = dtarr - other expected = DatetimeIndex( [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -2168,16 +2152,16 @@ def test_dti_isub_tdi(self, tz_naive_fixture): ids=lambda x: type(x).__name__, ) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) - def test_add_datetimelike_and_dti(self, addend, tz): + def test_add_datetimelike_and_dtarr(self, box_with_array, addend, tz): # GH#9631 dti = DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize(tz) - msg = ( - "cannot add DatetimeArray and {0}".format(type(addend).__name__) - ).replace("DatetimeIndex", "DatetimeArray") + dtarr = tm.box_expected(dti, box_with_array) + msg = "cannot add DatetimeArray and" + with pytest.raises(TypeError, match=msg): - dti + addend + dtarr + addend with pytest.raises(TypeError, match=msg): - addend + dti + addend + dtarr # ------------------------------------------------------------- @@ -2257,13 +2241,6 @@ def test_timedelta64_equal_timedelta_supported_ops(self, op): intervals = ["D", "h", "m", "s", "us"] - # TODO: unused - # npy16_mappings = {'D': 24 * 60 * 60 * 1000000, - # 'h': 60 * 60 * 1000000, - # 'm': 60 * 1000000, - # 's': 1000000, - # 'us': 1} - def timedelta64(*args): # see casting notes in NumPy gh-12927 return 
np.sum(list(starmap(np.timedelta64, zip(args, intervals)))) @@ -2406,82 +2383,30 @@ def test_dti_add_series(self, tz, names): result4 = index + ser.values tm.assert_index_equal(result4, expected) + @pytest.mark.parametrize("other_box", [pd.Index, Series]) + @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) @pytest.mark.parametrize( "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] ) - def test_dti_add_offset_index(self, tz_naive_fixture, names): + def test_dti_addsub_offset_arraylike(self, tz_naive_fixture, names, op, other_box): # GH#18849, GH#19744 - tz = tz_naive_fixture - dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti + other - expected = DatetimeIndex( - [dti[n] + other[n] for n in range(len(dti))], name=names[2], freq="infer" - ) - tm.assert_index_equal(res, expected) - - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res2 = other + dti - tm.assert_index_equal(res2, expected) - - @pytest.mark.parametrize( - "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] - ) - def test_dti_sub_offset_index(self, tz_naive_fixture, names): - # GH#18824, GH#19744 - tz = tz_naive_fixture - dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti - other - expected = DatetimeIndex( - [dti[n] - other[n] for n in range(len(dti))], name=names[2], freq="infer" - ) - tm.assert_index_equal(res, expected) + box = pd.Index + from .test_timedelta64 import get_upcast_box - @pytest.mark.parametrize( - "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] - ) - def test_dti_with_offset_series(self, tz_naive_fixture, names): - # GH#18849 tz = tz_naive_fixture dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - - expected_add = Series( - [dti[n] + other[n] for n in range(len(dti))], name=names[2] - ) + other = other_box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti + other - tm.assert_series_equal(res, expected_add) + xbox = get_upcast_box(box, other) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res2 = other + dti - tm.assert_series_equal(res2, expected_add) + with tm.assert_produces_warning(PerformanceWarning, clear=[dtl]): + res = op(dti, other) - expected_sub = Series( - [dti[n] - other[n] for n in range(len(dti))], name=names[2] + expected = DatetimeIndex( + [op(dti[n], other[n]) for n in range(len(dti))], name=names[2], freq="infer" ) - - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res3 = dti - other - tm.assert_series_equal(res3, expected_sub) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) @pytest.mark.parametrize("years", [-1, 0, 1]) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index ecb07fa49036a..d45daf9ab8433 100644 --- 
a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -896,11 +896,16 @@ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): result = other + idx tm.assert_equal(result, expected) - def test_td64arr_add_sub_timestamp(self, box_with_array): - # GH#11925 - ts = Timestamp("2012-01-01") - # TODO: parametrize over types of datetime scalar? - + @pytest.mark.parametrize( + "ts", + [ + Timestamp("2012-01-01"), + Timestamp("2012-01-01").to_pydatetime(), + Timestamp("2012-01-01").to_datetime64(), + ], + ) + def test_td64arr_add_sub_datetimelike_scalar(self, ts, box_with_array): + # GH#11925, GH#29558 tdi = timedelta_range("1 day", periods=3) expected = pd.date_range("2012-01-02", periods=3) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 1a48ccf85f947..e076015c5f61d 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -59,6 +59,24 @@ def test_isin_cats(): tm.assert_numpy_array_equal(expected, result) +@pytest.mark.parametrize( + "to_replace, value, result", + [("b", "c", ["a", "c"]), ("c", "d", ["a", "b"]), ("b", None, ["a", None])], +) +def test_replace(to_replace, value, result): + # GH 26988 + cat = pd.Categorical(["a", "b"]) + expected = pd.Categorical(result) + result = cat.replace(to_replace, value) + tm.assert_categorical_equal(result, expected) + if to_replace == "b": # the "c" test is supposed to be unchanged + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_categorical_equal(cat, expected) + cat.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) + + @pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])]) def test_isin_empty(empty): s = pd.Categorical(["a", "b"]) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 22c1d5373372a..d62c4f4cf936e 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -48,7 +48,7 @@ def test_comparisons(self): tm.assert_numpy_array_equal(result, expected) result = self.factor == "d" - expected = np.repeat(False, len(self.factor)) + expected = np.zeros(len(self.factor), dtype=bool) tm.assert_numpy_array_equal(result, expected) # comparisons with categoricals diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index f9bb4981df7df..755cbfb716fcd 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -658,12 +658,16 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[4:,] # noqa: E231 + res = sparse[ + 4:, + ] # noqa: E231 exp = SparseArray(dense[4:,]) # noqa: E231 tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[4:,] # noqa: E231 + res = sparse[ + 4:, + ] # noqa: E231 exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 tm.assert_sp_array_equal(res, exp) @@ -823,11 +827,11 @@ def test_nonzero(self): # Tests regression #21172. 
sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) - result, = sa.nonzero() + (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) - result, = sa.nonzero() + (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efe2b4e0b2deb..1ce62d8f8b3d9 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -171,3 +171,19 @@ def test_arrow_array(): arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array(["a", "b", None], dtype="string") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "string" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + tm.assert_frame_equal(result, df) + # ensure the missing value is represented by NaN and not None + assert np.isnan(result.loc[2, "a"]) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py new file mode 100644 index 0000000000000..5cfc7c3837875 --- /dev/null +++ b/pandas/tests/arrays/test_boolean.py @@ -0,0 +1,509 @@ +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array +from pandas.tests.extension.base import BaseOpsUtil +import pandas.util.testing as tm + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return pd.BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values must be a 1D array"): + BooleanArray(values.reshape(1, -1), mask) + + with pytest.raises(ValueError, match="mask must be a 1D array"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = 
BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) + + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, None], [True, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) +def test_to_boolean_array_none_is_nan(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + with pytest.raises(TypeError): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, True], dtype="bool") + result = 
BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + with pytest.raises(ValueError, match="values must be a 1D list-like"): + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="mask must be a 1D list-like"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + with pytest.raises(ValueError): + np.array(arr, dtype="bool") + + +def test_astype(): + # with missing values + arr = pd.array([True, False, None], dtype="boolean") + msg = "cannot convert float NaN to" + + with pytest.raises(ValueError, match=msg): + arr.astype("int64") + + with pytest.raises(ValueError, match=msg): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray 
+ arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + with pytest.raises(TypeError): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="boolean") + with pytest.raises(NotImplementedError): + np.add.reduce(a) + + +class TestLogicalOps(BaseOpsUtil): + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = np.nan + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = pd.Series(expected, dtype="boolean") + + # fill the nan locations + expected[data._mask] = np.nan + + tm.assert_series_equal(result, expected) + + def test_scalar(self, data, all_logical_operators): + op_name = all_logical_operators + self._compare_other(data, op_name, True) + + def test_array(self, data, all_logical_operators): + op_name = all_logical_operators + other = pd.array([True] * len(data), dtype="boolean") + 
self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = op_name == "__ne__" + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = expected.astype("boolean") + + # fill the nan locations + expected[data._mask] = op_name == "__ne__" + + tm.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + self._compare_other(data, op_name, True) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, op_name, other) + + +class TestArithmeticOps(BaseOpsUtil): + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + with pytest.raises(TypeError): + ops("foo") + with pytest.raises(TypeError): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # TODO(extension) numpy's mul with object array sees booleans as numbers + with pytest.raises(TypeError): + ops(pd.Series("foo", index=s.index)) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + with pytest.raises(NotImplementedError): + opa(np.arange(len(s)).reshape(-1, len(s))) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op in ("sum", "prod"): + assert isinstance(getattr(s, op)(), np.int64) + elif op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) + + +# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion +# manually in the indexing code +# def test_indexing_boolean_mask(): +# arr = pd.array([1, 2, 3, 4], dtype="Int64") +# mask = pd.array([True, False, True, False], dtype="boolean") +# result = arr[mask] +# expected = pd.array([1, 3], dtype="Int64") +# tm.assert_extension_array_equal(result, expected) + +# # missing values -> error +# mask = pd.array([True, False, True, None], dtype="boolean") +# with pytest.raises(IndexError): +# result = arr[mask] + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + expected = pa.array(np.array(data, dtype=object), type=pa.bool_(), from_pandas=True) + assert arr.equals(expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 117a19acbfc3a..5cab0c1fe6d59 100644 --- 
a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -57,7 +57,7 @@ def timedelta_index(request): class SharedTests: - index_cls = None # type: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific @@ -473,7 +473,15 @@ def test_strftime(self, datetime_index): arr = DatetimeArray(datetime_index) result = arr.strftime("%Y %b") - expected = np.array(datetime_index.strftime("%Y %b")) + expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = DatetimeArray(DatetimeIndex(["2019-01-01", pd.NaT])) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -679,7 +687,15 @@ def test_strftime(self, period_index): arr = PeriodArray(period_index) result = arr.strftime("%Y") - expected = np.array(period_index.strftime("%Y")) + expected = np.array([per.strftime("%Y") for per in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = PeriodArray(PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]")) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 793de66767cc3..443a0c7e71616 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -829,6 +829,38 @@ def test_arrow_array(data): assert arr.equals(expected) +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(data): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index a075521b67561..66e8e1bebfe98 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1889,11 +1889,11 @@ def test_invalid_parser(): pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") -_parsers = { +_parsers: Dict[str, Type[BaseExprVisitor]] = { "python": PythonExprVisitor, - "pytables": pytables.ExprVisitor, + "pytables": pytables.PyTablesExprVisitor, "pandas": PandasExprVisitor, -} # type: Dict[str, Type[BaseExprVisitor]] +} @pytest.mark.parametrize("engine", _engines) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 7abaa0651449e..912fce6339716 100644 --- a/pandas/tests/dtypes/test_common.py +++ 
b/pandas/tests/dtypes/test_common.py @@ -207,25 +207,6 @@ def test_is_categorical(): assert not com.is_categorical([1, 2, 3]) -def test_is_datetimetz(): - with tm.assert_produces_warning(FutureWarning): - assert not com.is_datetimetz([1, 2, 3]) - assert not com.is_datetimetz(pd.DatetimeIndex([1, 2, 3])) - - assert com.is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - - dtype = DatetimeTZDtype("ns", tz="US/Eastern") - s = pd.Series([], dtype=dtype) - assert com.is_datetimetz(s) - - -def test_is_period_deprecated(): - with tm.assert_produces_warning(FutureWarning): - assert not com.is_period([1, 2, 3]) - assert not com.is_period(pd.Index([1, 2, 3])) - assert com.is_period(pd.PeriodIndex(["2017-01-01"], freq="D")) - - def test_is_datetime64_dtype(): assert not com.is_datetime64_dtype(object) assert not com.is_datetime64_dtype([1, 2, 3]) @@ -309,7 +290,7 @@ def test_is_datetime_arraylike(): assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) -integer_dtypes = [] # type: List +integer_dtypes: List = [] @pytest.mark.parametrize( @@ -341,7 +322,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = [] # type: List +signed_integer_dtypes: List = [] @pytest.mark.parametrize( @@ -377,7 +358,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: List +unsigned_integer_dtypes: List = [] @pytest.mark.parametrize( @@ -548,7 +529,11 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False])) + assert com.is_bool_dtype(pd.BooleanDtype()) + assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) + +@pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning") @pytest.mark.parametrize( "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) @@ -573,6 +558,35 @@ def test_is_extension_type(check_scipy): assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) +def test_is_extension_type_deprecation(): + with tm.assert_produces_warning(FutureWarning): + com.is_extension_type([1, 2, 3]) + + +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) +def test_is_extension_array_dtype(check_scipy): + assert not com.is_extension_array_dtype([1, 2, 3]) + assert not com.is_extension_array_dtype(np.array([1, 2, 3])) + assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3])) + + cat = pd.Categorical([1, 2, 3]) + assert com.is_extension_array_dtype(cat) + assert com.is_extension_array_dtype(pd.Series(cat)) + assert com.is_extension_array_dtype(pd.SparseArray([1, 2, 3])) + assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + dtype = DatetimeTZDtype("ns", tz="US/Eastern") + s = pd.Series([], dtype=dtype) + assert com.is_extension_array_dtype(s) + + if check_scipy: + import scipy.sparse + + assert not com.is_extension_array_dtype(scipy.sparse.bsr_matrix([1, 2, 3])) + + def test_is_complex_dtype(): assert not com.is_complex_dtype(int) assert not com.is_complex_dtype(str) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index f4bf4c1fc83d9..fc896e6a9d348 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -12,10 +12,8 @@ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dtype_equal, is_interval_dtype, - is_period, is_period_dtype, 
is_string_dtype, ) @@ -294,25 +292,15 @@ def test_basic(self): assert not is_datetime64tz_dtype(np.dtype("float64")) assert not is_datetime64tz_dtype(1.0) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s) - assert is_datetimetz(s.dtype) - assert not is_datetimetz(np.dtype("float64")) - assert not is_datetimetz(1.0) - def test_dst(self): dr1 = date_range("2013-01-01", periods=3, tz="US/Eastern") s1 = Series(dr1, name="A") assert is_datetime64tz_dtype(s1) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s1) dr2 = date_range("2013-08-01", periods=3, tz="US/Eastern") s2 = Series(dr2, name="A") assert is_datetime64tz_dtype(s2) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s2) assert s1.dtype == s2.dtype @pytest.mark.parametrize("tz", ["UTC", "US/Eastern"]) @@ -457,22 +445,14 @@ def test_basic(self): assert is_period_dtype(pidx.dtype) assert is_period_dtype(pidx) - with tm.assert_produces_warning(FutureWarning): - assert is_period(pidx) s = Series(pidx, name="A") assert is_period_dtype(s.dtype) assert is_period_dtype(s) - with tm.assert_produces_warning(FutureWarning): - assert is_period(s) assert not is_period_dtype(np.dtype("float64")) assert not is_period_dtype(1.0) - with tm.assert_produces_warning(FutureWarning): - assert not is_period(np.dtype("float64")) - with tm.assert_produces_warning(FutureWarning): - assert not is_period(1.0) def test_empty(self): dt = PeriodDtype() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 62fb118f719e3..743b844917edf 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -51,6 +51,7 @@ Timestamp, isna, ) +from pandas.core.arrays import IntegerArray import pandas.util.testing as tm @@ -505,7 +506,7 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("value", [-2 ** 63 - 1, 2 ** 64]) + @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 arr = np.array([value], dtype=object) @@ -552,6 +553,20 @@ def test_maybe_convert_objects_datetime(self): out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) tm.assert_numpy_array_equal(out, exp) + @pytest.mark.parametrize( + "exp", + [ + IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), + IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + ], + ) + def test_maybe_convert_objects_nullable_integer(self, exp): + # GH27335 + arr = np.array([2, np.NaN], dtype=object) + result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1) + + tm.assert_extension_array_equal(result, exp) + def test_mixed_dtypes_remain_object_array(self): # GH14956 array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index e968962caf0b7..5e4fb6d69e52c 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -62,10 +62,10 @@ class BaseArithmeticOpsTests(BaseOpsUtil): * divmod_exc = TypeError """ - series_scalar_exc = TypeError # type: Optional[Type[TypeError]] - frame_scalar_exc = TypeError # type: Optional[Type[TypeError]] - series_array_exc = TypeError # type: Optional[Type[TypeError]] - divmod_exc = TypeError # type: Optional[Type[TypeError]] + series_scalar_exc: 
Optional[Type[TypeError]] = TypeError + frame_scalar_exc: Optional[Type[TypeError]] = TypeError + series_array_exc: Optional[Type[TypeError]] = TypeError + divmod_exc: Optional[Type[TypeError]] = TypeError def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 93816e3a8a613..f9ba4b7a8ba16 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -166,7 +166,14 @@ def _concat_same_type(cls, to_concat): def _reduce(self, name, skipna=True, **kwargs): if skipna: - raise NotImplementedError("decimal does not support skipna=True") + # If we don't have any NAs, we can ignore skipna + if self.isna().any(): + other = self[~self.isna()] + return other._reduce(name, **kwargs) + + if name == "sum" and len(self) == 0: + # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy + return decimal.Decimal(0) try: op = getattr(self.data, name) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 86724d4d09819..ce819c13c4498 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -145,7 +145,7 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): class Reduce: def check_reduce(self, s, op_name, skipna): - if skipna or op_name in ["median", "skew", "kurt"]: + if op_name in ["median", "skew", "kurt"]: with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index bc75ec6aeb2df..7e027a65eec3a 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -3,8 +3,6 @@ import pytest -from pandas.compat import PY36 - import pandas as pd from pandas.tests.extension import base import pandas.util.testing as tm @@ -180,9 +178,6 @@ def test_fillna_frame(self): unhashable = pytest.mark.skip(reason="Unhashable") -unstable = pytest.mark.skipif( - not PY36, reason="Dictionary order unstable" # 3.6 or higher -) class TestReduce(base.BaseNoReduceTests): @@ -199,20 +194,16 @@ def test_sort_values_frame(self): # TODO (EA.factorize): see if _values_for_factorize allows this. pass - @unstable def test_argsort(self, data_for_sorting): super().test_argsort(data_for_sorting) - @unstable def test_argsort_missing(self, data_missing_for_sorting): super().test_argsort_missing(data_missing_for_sorting) - @unstable @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): super().test_sort_values(data_for_sorting, ascending) - @unstable @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): super().test_sort_values_missing(data_missing_for_sorting, ascending) @@ -280,7 +271,6 @@ def test_groupby_extension_apply(self): we'll be able to dispatch unique. 
""" - @unstable @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py new file mode 100644 index 0000000000000..089dd798b2512 --- /dev/null +++ b/pandas/tests/extension/test_boolean.py @@ -0,0 +1,333 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p14 + +import pandas as pd +from pandas.core.arrays.boolean import BooleanDtype +from pandas.tests.extension import base +import pandas.util.testing as tm + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +@pytest.fixture +def data_for_twos(dtype): + return pd.array(np.ones(100), dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return pd.array([np.nan, True], dtype=dtype) + + +@pytest.fixture +def data_for_sorting(dtype): + return pd.array([True, True, False], dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return pd.array([True, np.nan, False], dtype=dtype) + + +@pytest.fixture +def na_cmp(): + # we are np.nan + return lambda x, y: np.isnan(x) and np.isnan(y) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(dtype): + b = True + a = False + na = np.nan + return pd.array([b, b, na, na, a, a, b], dtype=dtype) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + if op_name in ("__sub__", "__rsub__"): + # subtraction for bools raises TypeError (but not yet in 1.13) + if _np_version_under1p14: + pytest.skip("__sub__ does not yet raise in numpy 1.13") + with pytest.raises(TypeError): + op(s, other) + + return + + result = op(s, other) + expected = s.combine(other, op) + + if op_name in ( + "__floordiv__", + "__rfloordiv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", + ): + # combine keeps boolean type + expected = expected.astype("Int8") + elif op_name in ("__truediv__", "__rtruediv__"): + # combine with bools does not generate the correct result + # (numpy behaviour for div is to 
regard the bools as numeric) + expected = s.astype(float).combine(other, op) + if op_name == "__rpow__": + # for rpow, combine does not propagate NaN + expected[result.isna()] = np.nan + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def _check_divmod_op(self, s, op, other, exc=None): + # override to not raise an error + super()._check_divmod_op(s, op, other, None) + + @pytest.mark.skip(reason="BooleanArray does not error on ops") + def test_error(self, data, all_arithmetic_operators): + # other specific errors tested in the boolean array specific tests + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _compare_other(self, s, data, op_name, other): + self.check_opname(s, op_name, other) + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + # override because we only have 2 unique values + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_labels = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp + ) + expected_uniques = data_for_grouping.take([0, 4]) + + tm.assert_numpy_array_equal(labels, expected_labels) + self.assert_extension_array_equal(uniques, expected_uniques) + + def test_combine_le(self, data_repeated): + # override because expected needs to be boolean instead of bool dtype + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype="boolean", + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") + self.assert_series_equal(result, expected) + + def test_searchsorted(self, data_for_sorting, as_series): + # override because we only have 2 unique values + data_for_sorting = pd.array([True, False], dtype="boolean") + b, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + result = arr.searchsorted(arr.take([0, 1])) + expected = np.array([0, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestGroupby(base.BaseGroupbyTests): + """ + Groupby-specific tests are overridden because boolean only has 2 + unique values, base tests uses 3 groups. 
+ """ + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B"], "B": data_for_grouping} + ) + gr1 = df.groupby("A").grouper.groupings[0] + gr2 = df.groupby("B").grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouper, df.A.values) + tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + index = pd.Index(index, name="B") + expected = pd.Series([3, 1], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + index = pd.Index(index, name="B") + expected = pd.Series([1, 3], index=index, name="A") + self.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid}) + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3], name="A") + + self.assert_series_equal(result, expected) + + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + df.groupby("B").apply(groupby_apply_op) + df.groupby("B").A.apply(groupby_apply_op) + df.groupby("A").apply(groupby_apply_op) + df.groupby("A").B.apply(groupby_apply_op) + + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + ], + index=pd.Index([1, 2, 3], name="A"), + name="B", + ) + self.assert_series_equal(result, expected) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1], + } + ) + result = df.groupby("A").sum().columns + + if data_for_grouping.dtype._is_numeric: + expected = pd.Index(["B", "C"]) + else: + expected = pd.Index(["C"]) + + tm.assert_index_equal(result, expected) + + +class TestNumericReduce(base.BaseNumericReduceTests): + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + # override parent function to cast to bool for min/max + if op_name in ("min", "max") and not pd.isna(expected): + expected = bool(expected) + tm.assert_almost_equal(result, expected) + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +# TODO parsing not yet supported +# class TestParsing(base.BaseParsingTests): +# pass diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py new file mode 100644 index 0000000000000..b595e48797d41 --- /dev/null +++ 
b/pandas/tests/frame/indexing/test_categorical.py @@ -0,0 +1,388 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Index, Series +import pandas.util.testing as tm + + +class TestDataFrameIndexingCategorical: + def test_assignment(self): + # assignment + df = DataFrame( + {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} + ) + labels = Categorical( + ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + ) + + df = df.sort_values(by=["value"], ascending=True) + s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + d = s.values + df["D"] = d + str(df) + + result = df.dtypes + expected = Series( + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) + tm.assert_series_equal(result, expected) + + df["E"] = s + str(df) + + result = df.dtypes + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) + tm.assert_series_equal(result, expected) + + result1 = df["D"] + result2 = df["E"] + tm.assert_categorical_equal(result1._data._block.values, d) + + # sorting + s.name = "E" + tm.assert_series_equal(result2.sort_index(), s.sort_index()) + + cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) + df = DataFrame(Series(cat)) + + def test_assigning_ops(self): + # systematically test the assigning operations: + # for all slicing ops: + # for value in categories and value not in categories: + + # - assign a single value -> exp_single_cats_value + + # - assign a complete row (mixed values) -> exp_single_row + + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 1, 1, 1, 1, 1, 1] + orig = DataFrame({"cats": cats, "values": values}, index=idx) + + # the expected values + # changed single row + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 2, 1, 1, 1, 1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + + # changed part of the cats column + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values3 = [1, 1, 1, 1, 1, 1, 1] + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) + + # changed single value in cats col + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values4 = [1, 1, 1, 1, 1, 1, 1] + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) + + # iloc + # ############### + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.iloc[2, 0] = "b" + tm.assert_frame_equal(df, 
exp_single_cats_value) + + df = orig.copy() + df.iloc[df.index == "j", 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2, 0] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.iloc[2, :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2, :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.iloc[2:4, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2:4, :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.iloc[2:4, 0] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.iloc[2:4, 0] = ["c", "c"] + + # loc + # ############## + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.loc["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.loc[df.index == "j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", "cats"] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j", :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical( + ["b", "b"], categories=["a", "b", "c"] + ) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical( + ["c", "c"], categories=["a", "b", "c"] + ) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", "cats"] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.loc["j":"k", "cats"] = ["c", "c"] + + # loc + # ############## + # - assign a single value -> 
exp_single_cats_value + df = orig.copy() + df.loc["j", df.columns[0]] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.loc[df.index == "j", df.columns[0]] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", df.columns[0]] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j", :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical( + ["b", "b"], categories=["a", "b", "c"] + ) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical( + ["c", "c"], categories=["a", "b", "c"] + ) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", df.columns[0]] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.loc["j":"k", df.columns[0]] = ["c", "c"] + + # iat + df = orig.copy() + df.iat[2, 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iat[2, 0] = "c" + + # at + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.at["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.at["j", "cats"] = "c" + + # fancy indexing + catsf = Categorical( + ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) + idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) + valuesf = [1, 1, 3, 3, 1, 1, 1] + df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) + + exp_fancy = exp_multi_row.copy() + exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) + + df[df["cats"] == "c"] = ["b", 2] + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) + + # set_value + df = orig.copy() + df.at["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + with pytest.raises(ValueError): + df = orig.copy() + df.at["j", "cats"] = "c" + + # Assigning a Category to parts of a int/... 
column uses the values of + # the Categorical + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) + exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) + df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) + df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp) + + def test_functions_no_warnings(self): + df = DataFrame({"value": np.random.randint(0, 100, 20)}) + labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] + with tm.assert_produces_warning(False): + df["group"] = pd.cut( + df.value, range(0, 105, 10), right=False, labels=labels + ) + + def test_loc_indexing_preserves_index_category_dtype(self): + # GH 15166 + df = DataFrame( + data=np.arange(2, 22, 2), + index=pd.MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) + + expected = pd.CategoricalIndex( + ["a", "b"], + categories=["a", "b"], + ordered=False, + name="Index1", + dtype="category", + ) + + result = df.index.levels[0] + tm.assert_index_equal(result, expected) + + result = df.loc[["a"]].index.levels[0] + tm.assert_index_equal(result, expected) + + def test_wrong_length_cat_dtype_raises(self): + # GH29523 + cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) + df = pd.DataFrame({"bar": range(10)}) + err = "Length of values does not match length of index" + with pytest.raises(ValueError, match=err): + df["foo"] = cat diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py new file mode 100644 index 0000000000000..bde35c04acf4f --- /dev/null +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -0,0 +1,62 @@ +import pandas as pd +from pandas import DataFrame, Index, Series, date_range, notna +import pandas.util.testing as tm + + +class TestDataFrameIndexingDatetimeWithTZ: + def test_setitem(self, timezone_frame): + + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # assert that A & C are not sharing the same base (e.g. 
they + # are copies) + b1 = df._data.blocks[1] + b2 = df._data.blocks[2] + tm.assert_extension_array_equal(b1.values, b2.values) + assert id(b1.values._data.base) != id(b2.values._data.base) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = pd.NaT + df2.iloc[1, 2] = pd.NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal(df2.dtypes, df.dtypes) + + def test_set_reset(self): + + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype, "M8[ns, US/Eastern" + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + + def test_transpose(self, timezone_frame): + + result = timezone_frame.T + expected = DataFrame(timezone_frame.values.T) + expected.index = ["A", "B", "C"] + tm.assert_frame_equal(result, expected) + + def test_scalar_assignment(self): + # issue #19843 + df = pd.DataFrame(index=(0, 1, 2)) + df["now"] = pd.Timestamp("20130101", tz="UTC") + expected = pd.DataFrame( + {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] + ) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py similarity index 74% rename from pandas/tests/frame/test_indexing.py rename to pandas/tests/frame/indexing/test_indexing.py index ba7a4e2607a04..29a698d29bfa3 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -7,12 +7,10 @@ from pandas._libs.tslib import iNaT -from pandas.core.dtypes.common import is_float_dtype, is_integer, is_scalar -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.common import is_float_dtype, is_integer import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, @@ -2626,6 +2624,17 @@ def test_index_namedtuple(self): result = df.loc[IndexType("foo", "bar")]["A"] assert result == 1 + @pytest.mark.parametrize("tpl", [tuple([1]), tuple([1, 2])]) + def test_index_single_double_tuples(self, tpl): + # GH 20991 + idx = pd.Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) + df = DataFrame(index=idx) + + result = df.loc[[tpl]] + idx = pd.Index([tpl], name="A", tupleize_cols=False) + expected = DataFrame(index=idx) + tm.assert_frame_equal(result, expected) + def test_boolean_indexing(self): idx = list(range(3)) cols = ["A", "B", "C"] @@ -2695,576 +2704,6 @@ def test_boolean_indexing_mixed(self): with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): - default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) - - def _safe_add(df): - # only add to the numeric items - def is_ok(s): - return ( - issubclass(s.dtype.type, (np.integer, np.floating)) - and s.dtype != "uint8" - ) - - return DataFrame( - dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) - ) - - def _check_get(df, cond, check_dtypes=True): - other1 = _safe_add(df) - rs = df.where(cond, other1) - rs2 = df.where(cond.values, other1) - for k, v in rs.items(): - exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) - tm.assert_series_equal(v, exp, check_names=False) - tm.assert_frame_equal(rs, rs2) - - # dtypes - if check_dtypes: - assert (rs.dtypes == df.dtypes).all() - - # check getting - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - 
if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - cond = df > 0 - _check_get(df, cond) - - # upcasting case (GH # 2794) - df = DataFrame( - { - c: Series([1] * 3, dtype=c) - for c in ["float32", "float64", "int32", "int64"] - } - ) - df.iloc[1, :] = 0 - result = df.dtypes - expected = Series( - [ - np.dtype("float32"), - np.dtype("float64"), - np.dtype("int32"), - np.dtype("int64"), - ], - index=["float32", "float64", "int32", "int64"], - ) - - # when we don't preserve boolean casts - # - # expected = Series({ 'float32' : 1, 'float64' : 3 }) - - tm.assert_series_equal(result, expected) - - # aligning - def _check_align(df, cond, other, check_dtypes=True): - rs = df.where(cond, other) - for i, k in enumerate(rs.columns): - result = rs[k] - d = df[k].values - c = cond[k].reindex(df[k].index).fillna(False).values - - if is_scalar(other): - o = other - else: - if isinstance(other, np.ndarray): - o = Series(other[:, i], index=result.index).values - else: - o = other[k].values - - new_values = d if c.all() else np.where(c, d, o) - expected = Series(new_values, index=result.index, name=k) - - # since we can't always have the correct numpy dtype - # as numpy doesn't know how to downcast, don't check - tm.assert_series_equal(result, expected, check_dtype=False) - - # dtypes - # can't check dtype when other is an ndarray - - if check_dtypes and not isinstance(other, np.ndarray): - assert (rs.dtypes == df.dtypes).all() - - for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - # other is a frame - cond = (df > 0)[1:] - _check_align(df, cond, _safe_add(df)) - - # check other is ndarray - cond = df > 0 - _check_align(df, cond, (_safe_add(df).values)) - - # integers are upcast, so don't check the dtypes - cond = df > 0 - check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) - _check_align(df, cond, np.nan, check_dtypes=check_dtypes) - - # invalid conditions - df = default_frame - err1 = (df + 1).values[0:2, :] - msg = "other must be the same shape as self when an ndarray" - with pytest.raises(ValueError, match=msg): - df.where(cond, err1) - - err2 = cond.iloc[:2, :].values - other1 = _safe_add(df) - msg = "Array conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - df.where(err2, other1) - - with pytest.raises(ValueError, match=msg): - df.mask(True) - with pytest.raises(ValueError, match=msg): - df.mask(0) - - # where inplace - def _check_set(df, cond, check_dtypes=True): - dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) - expected = dfi.mask(~econd) - - dfi.where(cond, np.nan, inplace=True) - tm.assert_frame_equal(dfi, expected) - - # dtypes (and confirm upcasts)x - if check_dtypes: - for k, v in df.dtypes.items(): - if issubclass(v.type, np.integer) and not cond[k].all(): - v = np.dtype("float64") - assert dfi[k].dtype == v - - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - cond = df > 0 - _check_set(df, cond) - - cond = df >= 0 - _check_set(df, cond) - - # aligning - cond = (df >= 0)[1:] - _check_set(df, cond) - - # GH 10218 - # test DataFrame.where with Series slicing - df = DataFrame({"a": range(3), "b": range(4, 7)}) - result = df.where(df["a"] == 1) - expected = df[df["a"] == 1].reindex(df.index) - tm.assert_frame_equal(result, expected) - - 
@pytest.mark.parametrize("klass", [list, tuple, np.array]) - def test_where_array_like(self, klass): - # see gh-15414 - df = DataFrame({"a": [1, 2, 3]}) - cond = [[False], [True], [True]] - expected = DataFrame({"a": [np.nan, 2, 3]}) - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - df["b"] = 2 - expected["b"] = [2, np.nan, 2] - cond = [[False, True], [True, False], [True, True]] - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "cond", - [ - [[1], [0], [1]], - Series([[2], [5], [7]]), - DataFrame({"a": [2, 5, 7]}), - [["True"], ["False"], ["True"]], - [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], - ], - ) - def test_where_invalid_input_single(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - @pytest.mark.parametrize( - "cond", - [ - [[0, 1], [1, 0], [1, 1]], - Series([[0, 2], [5, 0], [4, 7]]), - [["False", "True"], ["True", "False"], ["True", "True"]], - DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), - [ - [pd.NaT, Timestamp("2017-01-01")], - [Timestamp("2017-01-02"), pd.NaT], - [Timestamp("2017-01-03"), Timestamp("2017-01-03")], - ], - ], - ) - def test_where_invalid_input_multiple(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - def test_where_dataframe_col_match(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - cond = DataFrame([[True, False, True], [False, False, True]]) - - result = df.where(cond) - expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) - tm.assert_frame_equal(result, expected) - - # this *does* align, though has no matching columns - cond.columns = ["a", "b", "c"] - result = df.where(cond) - expected = DataFrame(np.nan, index=df.index, columns=df.columns) - tm.assert_frame_equal(result, expected) - - def test_where_ndframe_align(self): - msg = "Array conditional must be same shape as self" - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - - cond = [True] - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - cond = np.array([False, True, False, True]) - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - def test_where_bug(self): - # see gh-2793 - df = DataFrame( - {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" - ) - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_mixed(self, sint_dtype): - # see gh-2793 - df = DataFrame( - { - "a": np.array([1, 2, 3, 4], dtype=sint_dtype), - "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), - } - ) - - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) 
- - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_transposition(self): - # see gh-7506 - a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) - b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - a = DataFrame({0: [4, 6], 1: [1, 0]}) - b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - def test_where_datetime(self): - - # GH 3311 - df = DataFrame( - dict( - A=date_range("20130102", periods=5), - B=date_range("20130104", periods=5), - C=np.random.randn(5), - ) - ) - - stamp = datetime(2013, 1, 3) - with pytest.raises(TypeError): - df > stamp - - result = df[df.iloc[:, :-1] > stamp] - - expected = df.copy() - expected.loc[[0, 1], "A"] = np.nan - expected.loc[:, "C"] = np.nan - tm.assert_frame_equal(result, expected) - - def test_where_none(self): - # GH 4667 - # setting with None changes dtype - df = DataFrame({"series": Series(range(10))}).astype(float) - df[df > 7] = None - expected = DataFrame( - {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} - ) - tm.assert_frame_equal(df, expected) - - # GH 7656 - df = DataFrame( - [ - {"A": 1, "B": np.nan, "C": "Test"}, - {"A": np.nan, "B": "Test", "C": np.nan}, - ] - ) - msg = "boolean setting on mixed-type" - - with pytest.raises(TypeError, match=msg): - df.where(~isna(df), None, inplace=True) - - def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): - # see gh-21947 - df = pd.DataFrame(columns=["a"]) - cond = df.applymap(lambda x: x > 0) - - result = df.where(cond) - tm.assert_frame_equal(result, df) - - def test_where_align(self): - def create(): - df = DataFrame(np.random.randn(10, 3)) - df.iloc[3:5, 0] = np.nan - df.iloc[4:6, 1] = np.nan - df.iloc[5:8, 2] = np.nan - return df - - # series - df = create() - expected = df.fillna(df.mean()) - result = df.where(pd.notna(df), df.mean(), axis="columns") - tm.assert_frame_equal(result, expected) - - df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") - tm.assert_frame_equal(df, expected) - - df = create().fillna(0) - expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) - result = df.where(df > 0, df[0], axis="index") - tm.assert_frame_equal(result, expected) - result = df.where(df > 0, df[0], axis="rows") - tm.assert_frame_equal(result, expected) - - # frame - df = create() - expected = df.fillna(1) - result = df.where( - pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) - ) - tm.assert_frame_equal(result, expected) - - def test_where_complex(self): - # GH 6345 - expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) - df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) - df[df.abs() >= 5] = np.nan - tm.assert_frame_equal(df, expected) - - def test_where_axis(self): - # GH 9736 - df = DataFrame(np.random.randn(2, 2)) - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, 1]) - - expected = DataFrame([[0, 0], [1, 1]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected 
= DataFrame([[0, 1], [0, 1]], dtype="float64") - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Upcast needed - df = DataFrame([[1, 2], [3, 4]], dtype="int64") - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, np.nan]) - - expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, np.nan], [0, np.nan]]) - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Multiple dtypes (=> multiple Blocks) - df = pd.concat( - [ - DataFrame(np.random.randn(10, 2)), - DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), - ], - ignore_index=True, - axis=1, - ) - mask = DataFrame(False, columns=df.columns, index=df.index) - s1 = Series(1, index=df.columns) - s2 = Series(2, index=df.index) - - result = df.where(mask, s1, axis="columns") - expected = DataFrame(1.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s1, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - result = df.where(mask, s2, axis="index") - expected = DataFrame(2.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s2, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - # DataFrame vs DataFrame - d1 = df.copy().drop(1, axis=0) - expected = df.copy() - expected.loc[1, :] = np.nan - - result = df.where(mask, d1) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d1, axis="index") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True, axis="index") - tm.assert_frame_equal(result, expected) - - d2 = df.copy().drop(1, axis=1) - expected = df.copy() - expected.loc[:, 1] = np.nan - - result = df.where(mask, d2) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d2, axis="columns") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True, axis="columns") - tm.assert_frame_equal(result, expected) - - def test_where_callable(self): - # GH 12533 - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = df.where(lambda x: x > 4, lambda x: x + 1) - exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df > 4, df + 1)) - - # return ndarray and scalar - result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) - exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) - 
tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) - - # chain - result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) - exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) - - def test_where_tz_values(self, tz_naive_fixture): - df1 = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), - columns=["date"], - ) - df2 = DataFrame( - DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - mask = DataFrame([True, True, False], columns=["date"]) - exp = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - result = df1.where(mask, df2) - tm.assert_frame_equal(exp, result) - def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0 @@ -3402,65 +2841,6 @@ def test_interval_index(self): tm.assert_series_equal(result, expected) -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. they - # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - assert id(b1.values._data.base) != id(b2.values._data.base) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype, "M8[ns, US/Eastern" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - - def test_transpose(self, timezone_frame): - - result = timezone_frame.T - expected = DataFrame(timezone_frame.values.T) - expected.index = ["A", "B", "C"] - tm.assert_frame_equal(result, expected) - - def test_scalar_assignment(self): - # issue #19843 - df = pd.DataFrame(index=(0, 1, 2)) - df["now"] = pd.Timestamp("20130101", tz="UTC") - expected = pd.DataFrame( - {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] - ) - tm.assert_frame_equal(df, expected) - - class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): @@ -3508,361 +2888,4 @@ def test_transpose(self, uint64_frame): result = uint64_frame.T expected = DataFrame(uint64_frame.values.T) expected.index = ["A", "B"] - tm.assert_frame_equal(result, expected) - - -class TestDataFrameIndexingCategorical: - def test_assignment(self): - # assignment - df = DataFrame( - {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} - ) - labels = Categorical( - ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - ) - - df = df.sort_values(by=["value"], ascending=True) - s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - d = s.values - df["D"] = d - str(df) - - result = df.dtypes - expected = Series( - [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], - index=["value", "D"], - ) - 
tm.assert_series_equal(result, expected) - - df["E"] = s - str(df) - - result = df.dtypes - expected = Series( - [ - np.dtype("int32"), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False), - ], - index=["value", "D", "E"], - ) - tm.assert_series_equal(result, expected) - - result1 = df["D"] - result2 = df["E"] - tm.assert_categorical_equal(result1._data._block.values, d) - - # sorting - s.name = "E" - tm.assert_series_equal(result2.sort_index(), s.sort_index()) - - cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) - df = DataFrame(Series(cat)) - - def test_assigning_ops(self): - # systematically test the assigning operations: - # for all slicing ops: - # for value in categories and value not in categories: - - # - assign a single value -> exp_single_cats_value - - # - assign a complete row (mixed values) -> exp_single_row - - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n"]) - values = [1, 1, 1, 1, 1, 1, 1] - orig = DataFrame({"cats": cats, "values": values}, index=idx) - - # the expected values - # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - - # changed part of the cats column - cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) - - # changed single value in cats col - cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame( - {"cats": cats4, "values": values4}, index=idx4 - ) - - # iloc - # ############### - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.iloc[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.iloc[df.index == "j", 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, 0] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.iloc[2, :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.iloc[2:4, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2:4, :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == 
categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.iloc[2:4, 0] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", "cats"] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", "cats"] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", df.columns[0]] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with 
pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", df.columns[0]] = ["c", "c"] - - # iat - df = orig.copy() - df.iat[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iat[2, 0] = "c" - - # at - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # fancy indexing - catsf = Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] - ) - idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) - valuesf = [1, 1, 3, 3, 1, 1, 1] - df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) - - exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) - - df[df["cats"] == "c"] = ["b", 2] - # category c is kept in .categories - tm.assert_frame_equal(df, exp_fancy) - - # set_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # Assigning a Category to parts of a int/... 
column uses the values of - # the Categorical - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) - df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) - - def test_setitem_single_row_categorical(self): - # GH 25495 - df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) - categories = pd.Categorical(df["Alpha"], categories=["a", "b", "c"]) - df.loc[:, "Alpha"] = categories - - result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha") - tm.assert_series_equal(result, expected) - - def test_functions_no_warnings(self): - df = DataFrame({"value": np.random.randint(0, 100, 20)}) - labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] - with tm.assert_produces_warning(False): - df["group"] = pd.cut( - df.value, range(0, 105, 10), right=False, labels=labels - ) + tm.assert_frame_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py new file mode 100644 index 0000000000000..4fea190f28d7b --- /dev/null +++ b/pandas/tests/frame/indexing/test_where.py @@ -0,0 +1,582 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range, isna +import pandas.util.testing as tm + + +class TestDataFrameIndexingWhere: + def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): + default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) + + def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) + and s.dtype != "uint8" + ) + + return DataFrame( + dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) + ) + + def _check_get(df, cond, check_dtypes=True): + other1 = _safe_add(df) + rs = df.where(cond, other1) + rs2 = df.where(cond.values, other1) + for k, v in rs.items(): + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) + tm.assert_series_equal(v, exp, check_names=False) + tm.assert_frame_equal(rs, rs2) + + # dtypes + if check_dtypes: + assert (rs.dtypes == df.dtypes).all() + + # check getting + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + cond = df > 0 + _check_get(df, cond) + + # upcasting case (GH # 2794) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) + df.iloc[1, :] = 0 + result = df.dtypes + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) + + # when we don't preserve boolean casts + # + # expected = Series({ 'float32' : 1, 'float64' : 3 }) + + tm.assert_series_equal(result, expected) + + # aligning + def _check_align(df, cond, other, check_dtypes=True): + rs = df.where(cond, other) + for i, k in enumerate(rs.columns): + result = rs[k] + d = df[k].values + c = cond[k].reindex(df[k].index).fillna(False).values + + if is_scalar(other): + o = other + else: + if isinstance(other, np.ndarray): + o = Series(other[:, i], index=result.index).values + else: + o = other[k].values + + new_values 
= d if c.all() else np.where(c, d, o) + expected = Series(new_values, index=result.index, name=k) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + tm.assert_series_equal(result, expected, check_dtype=False) + + # dtypes + # can't check dtype when other is an ndarray + + if check_dtypes and not isinstance(other, np.ndarray): + assert (rs.dtypes == df.dtypes).all() + + for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) + + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) + + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) + _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + + # invalid conditions + df = default_frame + err1 = (df + 1).values[0:2, :] + msg = "other must be the same shape as self when an ndarray" + with pytest.raises(ValueError, match=msg): + df.where(cond, err1) + + err2 = cond.iloc[:2, :].values + other1 = _safe_add(df) + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + df.where(err2, other1) + + with pytest.raises(ValueError, match=msg): + df.mask(True) + with pytest.raises(ValueError, match=msg): + df.mask(0) + + # where inplace + def _check_set(df, cond, check_dtypes=True): + dfi = df.copy() + econd = cond.reindex_like(df).fillna(True) + expected = dfi.mask(~econd) + + dfi.where(cond, np.nan, inplace=True) + tm.assert_frame_equal(dfi, expected) + + # dtypes (and confirm upcasts)x + if check_dtypes: + for k, v in df.dtypes.items(): + if issubclass(v.type, np.integer) and not cond[k].all(): + v = np.dtype("float64") + assert dfi[k].dtype == v + + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + cond = df > 0 + _check_set(df, cond) + + cond = df >= 0 + _check_set(df, cond) + + # aligning + cond = (df >= 0)[1:] + _check_set(df, cond) + + # GH 10218 + # test DataFrame.where with Series slicing + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, np.array]) + def test_where_array_like(self, klass): + # see gh-15414 + df = DataFrame({"a": [1, 2, 3]}) + cond = [[False], [True], [True]] + expected = DataFrame({"a": [np.nan, 2, 3]}) + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + df["b"] = 2 + expected["b"] = [2, np.nan, 2] + cond = [[False, True], [True, False], [True, True]] + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], + ], + ) + def test_where_invalid_input_single(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 
2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) + def test_where_invalid_input_multiple(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + def test_where_dataframe_col_match(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + cond = DataFrame([[True, False, True], [False, False, True]]) + + result = df.where(cond) + expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) + tm.assert_frame_equal(result, expected) + + # this *does* align, though has no matching columns + cond.columns = ["a", "b", "c"] + result = df.where(cond) + expected = DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + cond = np.array([False, True, False, True]) + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + def test_where_bug(self): + # see gh-2793 + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_mixed(self, sint_dtype): + # see gh-2793 + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=sint_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) + + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_transposition(self): + # see gh-7506 + a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) + b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + a = DataFrame({0: [4, 6], 1: [1, 0]}) + b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + def test_where_datetime(self): + + # GH 3311 + df = DataFrame( + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) + ) + + stamp = datetime(2013, 1, 3) + with pytest.raises(TypeError): + df > stamp + + result = df[df.iloc[:, :-1] > stamp] + + expected 
= df.copy() + expected.loc[[0, 1], "A"] = np.nan + expected.loc[:, "C"] = np.nan + tm.assert_frame_equal(result, expected) + + def test_where_none(self): + # GH 4667 + # setting with None changes dtype + df = DataFrame({"series": Series(range(10))}).astype(float) + df[df > 7] = None + expected = DataFrame( + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) + tm.assert_frame_equal(df, expected) + + # GH 7656 + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + msg = "boolean setting on mixed-type" + + with pytest.raises(TypeError, match=msg): + df.where(~isna(df), None, inplace=True) + + def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): + # see gh-21947 + df = pd.DataFrame(columns=["a"]) + cond = df.applymap(lambda x: x > 0) + + result = df.where(cond) + tm.assert_frame_equal(result, df) + + def test_where_align(self): + def create(): + df = DataFrame(np.random.randn(10, 3)) + df.iloc[3:5, 0] = np.nan + df.iloc[4:6, 1] = np.nan + df.iloc[5:8, 2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notna(df), df.mean(), axis="columns") + tm.assert_frame_equal(result, expected) + + df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + tm.assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) + result = df.where(df > 0, df[0], axis="index") + tm.assert_frame_equal(result, expected) + result = df.where(df > 0, df[0], axis="rows") + tm.assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal(result, expected) + + def test_where_complex(self): + # GH 6345 + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) + df[df.abs() >= 5] = np.nan + tm.assert_frame_equal(df, expected) + + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.randn(2, 2)) + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype="int64") + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]]) + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + 
+ # Multiple dtypes (=> multiple Blocks) + df = pd.concat( + [ + DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), + ], + ignore_index=True, + axis=1, + ) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis="columns") + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s1, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis="index") + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s2, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + expected = df.copy() + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d1, axis="index") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True, axis="index") + tm.assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d2, axis="columns") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True, axis="columns") + tm.assert_frame_equal(result, expected) + + def test_where_callable(self): + # GH 12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.where(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df > 4, df + 1)) + + # return ndarray and scalar + result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) + + # chain + result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) + + def test_where_tz_values(self, tz_naive_fixture): + df1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + df2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + result = df1.where(mask, df2) + tm.assert_frame_equal(exp, result) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 9b76be18b0e88..6206b333d29e1 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ 
b/pandas/tests/frame/test_alter_axes.py @@ -381,7 +381,7 @@ def test_set_index_custom_label_hashable_iterable(self): class Thing(frozenset): # need to stabilize repr for KeyError (due to random order in sets) def __repr__(self) -> str: - tmp = sorted(list(self)) + tmp = sorted(self) # double curly brace prints one brace in format string return "frozenset({{{}}})".format(", ".join(map(repr, tmp))) @@ -493,29 +493,29 @@ def test_convert_dti_to_series(self): tm.assert_series_equal(result, expected) # convert to series while keeping the timezone - result = idx.to_series(keep_tz=True, index=[0, 1]) + msg = "stop passing 'keep_tz'" + with tm.assert_produces_warning(FutureWarning) as m: + result = idx.to_series(keep_tz=True, index=[0, 1]) tm.assert_series_equal(result, expected) + assert msg in str(m[0].message) # convert to utc - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning) as m: df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) result = df["B"] comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) - - with tm.assert_produces_warning(FutureWarning) as m: - result = idx.to_series(index=[0, 1]) - tm.assert_series_equal(result, expected.dt.tz_convert(None)) - msg = ( - "The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change to True in a future " - "release." - ) + msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) - with tm.assert_produces_warning(FutureWarning): + result = idx.to_series(index=[0, 1]) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=False, index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) + msg = "do 'idx.tz_convert(None)' before calling" + assert msg in str(m[0].message) # list of datetimes with a tz df["B"] = idx.to_pydatetime() @@ -745,8 +745,7 @@ def test_rename_axis_mapper(self): # GH 19978 mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) df = DataFrame( - {"x": [i for i in range(len(mi))], "y": [i * 10 for i in range(len(mi))]}, - index=mi, + {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi ) # Test for rename of the Index object of columns diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f694689fa9dfb..005ca8d95182e 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1244,7 +1244,7 @@ def test_mode_dropna(self, dropna, expected): } ) - result = df[sorted(list(expected.keys()))].mode(dropna=dropna) + result = df[sorted(expected.keys())].mode(dropna=dropna) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @@ -2279,14 +2279,6 @@ def test_clip(self, float_frame): median = float_frame.median().median() original = float_frame.copy() - with tm.assert_produces_warning(FutureWarning): - capped = float_frame.clip_upper(median) - assert not (capped.values > median).any() - - with tm.assert_produces_warning(FutureWarning): - floored = float_frame.clip_lower(median) - assert not (floored.values < median).any() - double = float_frame.clip(upper=median, lower=median) assert not (double.values != median).any() @@ -2298,16 +2290,6 @@ def test_inplace_clip(self, float_frame): median = float_frame.median().median() frame_copy = float_frame.copy() - with tm.assert_produces_warning(FutureWarning): - frame_copy.clip_upper(median, inplace=True) - assert not 
(frame_copy.values > median).any() - frame_copy = float_frame.copy() - - with tm.assert_produces_warning(FutureWarning): - frame_copy.clip_lower(median, inplace=True) - assert not (frame_copy.values < median).any() - frame_copy = float_frame.copy() - frame_copy.clip(upper=median, lower=median, inplace=True) assert not (frame_copy.values != median).any() @@ -2759,8 +2741,7 @@ def test_series_broadcasting(self): s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): - with tm.assert_produces_warning(FutureWarning): - df_nan.clip_lower(s, axis=0) + df_nan.clip(lower=s, axis=0) for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 50b1dec21c549..a86e1dfe8353c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -476,14 +476,6 @@ def test_values(self, float_frame): float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() - def test_as_matrix_deprecated(self, float_frame): - # GH 18458 - with tm.assert_produces_warning(FutureWarning): - cols = float_frame.columns.tolist() - result = float_frame.as_matrix(columns=cols) - expected = float_frame.values - tm.assert_numpy_array_equal(result, expected) - def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) series = cp["A"] diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fea50b3b7f75d..3c97a87c95bd2 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -13,6 +13,7 @@ from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna from pandas.conftest import _get_cython_table_params from pandas.core.apply import frame_apply +from pandas.core.base import SpecificationError import pandas.util.testing as tm @@ -1094,7 +1095,8 @@ def test_agg_dict_nested_renaming_depr(self): df = pd.DataFrame({"A": range(5), "B": 5}) # nested renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) def test_agg_reduce(self, axis, float_frame): @@ -1259,6 +1261,23 @@ def test_non_callable_aggregates(self): assert result == expected + def test_agg_listlike_result(self): + # GH-29587 user defined function returning list-likes + df = DataFrame( + {"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]} + ) + + def func(group_col): + return list(group_col.dropna().unique()) + + result = df.agg(func) + expected = pd.Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg([func]) + expected = expected.to_frame("func").T + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "df, func, expected", chain( diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b45c074f179a0..d491e9f25c897 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -313,10 +313,7 @@ def test_copy_blocks(self, float_frame): column = df.columns[0] # use the default copy=True, change a column - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - blocks = df.as_blocks() + blocks = df._to_dict_of_blocks(copy=True) for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 @@ -330,10 +327,7 @@ def 
test_no_copy_blocks(self, float_frame): column = df.columns[0] # use the copy=False, change a column - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - blocks = df.as_blocks(copy=False) + blocks = df._to_dict_of_blocks(copy=False) for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index e3f37e1ef3186..e72de487abb2f 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -128,6 +128,20 @@ def test_concat_tuple_keys(self): ) tm.assert_frame_equal(results, expected) + def test_append_empty_list(self): + # GH 28769 + df = DataFrame() + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df + + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df # .append() should return a new object + def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -355,13 +369,6 @@ def test_update_raise_on_overlap(self): with pytest.raises(ValueError, match="Data overlaps"): df.update(other, errors="raise") - @pytest.mark.parametrize("raise_conflict", [True, False]) - def test_update_deprecation(self, raise_conflict): - df = DataFrame([[1.5, 1, 3.0]]) - other = DataFrame() - with tm.assert_produces_warning(FutureWarning): - df.update(other, raise_conflict=raise_conflict) - def test_update_from_non_df(self): d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} df = DataFrame(d) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1d030bbc75521..ce0ebdbe56354 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -8,9 +8,8 @@ import numpy.ma.mrecords as mrecords import pytest -from pandas.compat import PY36, is_platform_little_endian +from pandas.compat import is_platform_little_endian -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -245,9 +244,9 @@ def test_constructor_overflow_int64(self): np.array([2 ** 64], dtype=object), np.array([2 ** 65]), [2 ** 64 + 1], - np.array([-2 ** 63 - 4], dtype=object), - np.array([-2 ** 64 - 1]), - [-2 ** 65 - 2], + np.array([-(2 ** 63) - 4], dtype=object), + np.array([-(2 ** 64) - 1]), + [-(2 ** 65) - 2], ], ) def test_constructor_int_overflow(self, values): @@ -387,7 +386,6 @@ def test_constructor_dict_nan_tuple_key(self, value): result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY36, reason="Insertion order for Python>=3.6") def test_constructor_dict_order_insertion(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) @@ -399,18 +397,6 @@ def test_constructor_dict_order_insertion(self): expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) - @pytest.mark.skipif(PY36, reason="order by value for Python<3.6") - def test_constructor_dict_order_by_values(self): - datetime_series = tm.makeTimeSeries(nper=30) - datetime_series_short = tm.makeTimeSeries(nper=25) - - # GH19018 - # initialization ordering: by value if python<3.6 - d = {"b": datetime_series_short, "a": 
datetime_series} - frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list("ab")) - tm.assert_frame_equal(frame, expected) - def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame @@ -1373,7 +1359,7 @@ def test_constructor_list_of_dict_order(self): } ) result = DataFrame(data) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series @@ -1521,92 +1507,6 @@ def test_constructor_manager_resize(self, float_frame): tm.assert_index_equal(result.index, Index(index)) tm.assert_index_equal(result.columns, Index(columns)) - def test_constructor_from_items(self, float_frame, float_string_frame): - items = [(c, float_frame[c]) for c in float_frame.columns] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items(items) - tm.assert_frame_equal(recons, float_frame) - - # pass some columns - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items(items, columns=["C", "B", "A"]) - tm.assert_frame_equal(recons, float_frame.loc[:, ["C", "B", "A"]]) - - # orient='index' - - row_items = [ - (idx, float_string_frame.xs(idx)) for idx in float_string_frame.index - ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items( - row_items, columns=float_string_frame.columns, orient="index" - ) - tm.assert_frame_equal(recons, float_string_frame) - assert recons["A"].dtype == np.float64 - - msg = "Must pass columns with orient='index'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items(row_items, orient="index") - - # orient='index', but thar be tuples - arr = construct_1d_object_array_from_listlike( - [("bar", "baz")] * len(float_string_frame) - ) - float_string_frame["foo"] = arr - row_items = [ - (idx, list(float_string_frame.xs(idx))) for idx in float_string_frame.index - ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items( - row_items, columns=float_string_frame.columns, orient="index" - ) - tm.assert_frame_equal(recons, float_string_frame) - assert isinstance(recons["foo"][0], tuple) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rs = DataFrame.from_items( - [("A", [1, 2, 3]), ("B", [4, 5, 6])], - orient="index", - columns=["one", "two", "three"], - ) - xp = DataFrame( - [[1, 2, 3], [4, 5, 6]], index=["A", "B"], columns=["one", "two", "three"] - ) - tm.assert_frame_equal(rs, xp) - - def test_constructor_from_items_scalars(self): - # GH 17312 - msg = ( - r"The value in each \(key, value\) " - "pair must be an array, Series, or dict" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items([("A", 1), ("B", 4)]) - - msg = ( - r"The value in each \(key, value\) " - "pair must be an array, Series, or dict" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items( - [("A", 1), ("B", 2)], columns=["col1"], orient="index" - ) - - def test_from_items_deprecation(self): - # GH 17320 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items([("A", [1, 2, 3]), ("B", [4, 5, 6])]) - - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items( - [("A", [1, 2, 3]), ("B", [4, 5, 6])], - columns=["col1", "col2", "col3"], - orient="index", - ) - def test_constructor_mix_series_nonseries(self, float_frame): df = DataFrame( {"A": float_frame["A"], "B": list(float_frame["B"])}, columns=["A", "B"] @@ -1808,7 +1708,7 @@ def test_constructor_with_datetimes(self): # preserver an index with a tz on dict construction i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") - expected = DataFrame({"a": i.to_series(keep_tz=True).reset_index(drop=True)}) + expected = DataFrame({"a": i.to_series().reset_index(drop=True)}) df = DataFrame() df["a"] = i tm.assert_frame_equal(df, expected) @@ -1819,9 +1719,7 @@ def test_constructor_with_datetimes(self): # multiples i_no_tz = date_range("1/1/2011", periods=5, freq="10s") df = DataFrame({"a": i, "b": i_no_tz}) - expected = DataFrame( - {"a": i.to_series(keep_tz=True).reset_index(drop=True), "b": i_no_tz} - ) + expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz}) tm.assert_frame_equal(df, expected) def test_constructor_datetimes_with_nulls(self): diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index e1dda1411edbd..63a98fda974a6 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -83,23 +83,10 @@ def test_to_records_dt64(self): index=date_range("2012-01-01", "2012-01-02"), ) - # convert_datetime64 defaults to None expected = df.index.values[0] result = df.to_records()["index"][0] assert expected == result - # check for FutureWarning if convert_datetime64=False is passed - with tm.assert_produces_warning(FutureWarning): - expected = df.index.values[0] - result = df.to_records(convert_datetime64=False)["index"][0] - assert expected == result - - # check for FutureWarning if convert_datetime64=True is passed - with tm.assert_produces_warning(FutureWarning): - expected = df.index[0] - result = df.to_records(convert_datetime64=True)["index"][0] - assert expected == result - def test_to_records_with_multindex(self): # GH3189 index = [ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 89fd7ccd91f81..cdcd5996324da 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -850,6 +850,31 @@ def test_astype_column_metadata(self, dtype): df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) + def test_df_where_change_dtype(self): + # GH 16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.where(mask) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): diff --git a/pandas/tests/frame/test_mutate_columns.py 
b/pandas/tests/frame/test_mutate_columns.py index 7feb55f2fac09..8c0dd67af4e7d 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Index, MultiIndex, Series import pandas.util.testing as tm @@ -60,10 +58,7 @@ def test_assign_order(self): df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) result = df.assign(D=df.A + df.B, C=df.A - df.B) - if PY36: - expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) - else: - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) tm.assert_frame_equal(result, expected) result = df.assign(C=df.A - df.B, D=df.A + df.B) @@ -80,25 +75,6 @@ def test_assign_bad(self): with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) - @pytest.mark.skipif( - PY36, - reason="""Issue #14207: valid for python - 3.6 and above""", - ) - def test_assign_dependent_old_python(self): - df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - # Key C does not exist at definition time of df - with pytest.raises(KeyError, match="^'C'$"): - df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) - with pytest.raises(KeyError, match="^'C'$"): - df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) - - @pytest.mark.skipif( - not PY36, - reason="""Issue #14207: not valid for - python 3.5 and below""", - ) def test_assign_dependent(self): df = DataFrame({"A": [1, 2], "B": [3, 4]}) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 19d91241d6a6b..f3e61dffb500d 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -218,6 +218,42 @@ def test_logical_with_nas(self): expected = Series([True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(DataFrame(left), DataFrame(right)) + expected = DataFrame(expected) + + tm.assert_frame_equal(result, expected) + class TestDataFrameOperators: @pytest.mark.parametrize( @@ -530,6 +566,16 @@ def test_comp(func): test_comp(operator.ge) test_comp(operator.le) + def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): + # GH 11565 + df = DataFrame( + {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]} + ) + + f = getattr(operator, compare_operators_no_eq_ne) + with pytest.raises(TypeError): + f(df, 0) + def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() missing_df.iloc[0]["A"] = np.nan diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 04d27f4c12c59..cd1bee356ed8e 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -991,7 +991,7 @@ def test_query_lex_compare_strings(self, parser, engine): ops = {"<": operator.lt, ">": operator.gt, "<=": operator.le, ">=": operator.ge} for op, func in 
ops.items(): - res = df.query('X %s "d"' % op, engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index c30efa121262f..434ea6ea7b4f0 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1296,6 +1296,24 @@ def test_replace_method(self, to_replace, method, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + expected = DataFrame(final_data, columns=["a", "b"], dtype="category") + expected["a"] = expected["a"].cat.set_categories([1, 2, 3]) + expected["b"] = expected["b"].cat.set_categories([1, 2, 3]) + result = df.replace(replace_dict, 3) + tm.assert_frame_equal(result, expected) + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + df.replace(replace_dict, 3, inplace=True) + tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize( "df, to_replace, exp", [ @@ -1320,5 +1338,21 @@ def test_replace_commutative(self, df, to_replace, exp): expected = pd.DataFrame(exp) result = df.replace(to_replace) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "replacer", + [ + pd.Timestamp("20170827"), + np.int8(1), + np.int16(1), + np.float32(1), + np.float64(1), + ], + ) + def test_replace_replacer_dtype(self, replacer): + # GH26632 + df = pd.DataFrame(["a"]) + result = df.replace({"a": replacer, "b": replacer}) + expected = pd.DataFrame([replacer]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 5d2c115ce8eb5..5acd681933914 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -699,7 +699,7 @@ def verify(df): for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) - right = sorted(list(map(cast, right))) + right = sorted(map(cast, right)) assert left == right df = DataFrame( diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index ae452e6faef01..096a5aa99bd80 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -243,11 +243,6 @@ def test_to_xarray(self): assert isinstance(result, DataArray) tm.assert_series_equal(result.to_series(), s) - def test_valid_deprecated(self): - # GH18800 - with tm.assert_produces_warning(FutureWarning): - pd.Series([]).valid() - @pytest.mark.parametrize( "s", [ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c03ffe317083c..ea986058616d7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -8,7 +8,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, concat from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from 
pandas.core.groupby.grouper import Grouping @@ -267,16 +267,16 @@ def bar(x): return np.std(x, ddof=1) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): d = OrderedDict( [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]] ) - result = grouped.aggregate(d) + grouped.aggregate(d) + # But without renaming, these functions are OK d = OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]) - expected = grouped.aggregate(d) - - tm.assert_frame_equal(result, expected) + grouped.aggregate(d) def test_multi_function_flexible_mix(df): @@ -288,26 +288,25 @@ def test_multi_function_flexible_mix(df): [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", {"sum": "sum"}]] ) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = grouped.aggregate(d) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) # Test 1 d = OrderedDict( [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", "sum"]] ) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) # Test 2 d = OrderedDict( [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", ["sum"]]] ) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) def test_groupby_agg_coercing_bools(): @@ -361,9 +360,7 @@ def test_series_named_agg(self): tm.assert_frame_equal(result, expected) result = gr.agg(b="min", a="sum") - # sort for 35 and earlier - if compat.PY36: - expected = expected[["b", "a"]] + expected = expected[["b", "a"]] tm.assert_frame_equal(result, expected) def test_no_args_raises(self): @@ -425,8 +422,6 @@ def test_agg_relabel(self): index=pd.Index(["a", "b"], name="group"), columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], ) - if not compat.PY36: - expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]] tm.assert_frame_equal(result, expected) def test_agg_relabel_non_identifier(self): @@ -495,6 +490,80 @@ def test_mangled(self): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3", + [ + ( + (("y", "A"), "max"), + (("y", "A"), np.min), + (("y", "B"), "mean"), + [1, 3], + [0, 2], + [5.5, 7.5], + ), + ( + (("y", "A"), lambda x: max(x)), + (("y", "A"), lambda x: 1), + (("y", "B"), "mean"), + [1, 3], + [1, 1], + [5.5, 7.5], + ), + ( + pd.NamedAgg(("y", "A"), "max"), + pd.NamedAgg(("y", "B"), np.mean), + pd.NamedAgg(("y", "A"), lambda x: 1), + [1, 3], + [5.5, 7.5], + [1, 1], + ), + ], +) +def test_agg_relabel_multiindex_column( + agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3 +): + # GH 29422, add tests for multiindex column cases + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + idx = pd.Index(["a", "b"], name=("x", "group")) + + result = df.groupby(("x", 
"group")).agg(a_max=(("y", "A"), "max")) + expected = DataFrame({"a_max": [1, 3]}, index=idx) + tm.assert_frame_equal(result, expected) + + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) + expected = DataFrame( + {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_multiindex_raises_not_exist(): + # GH 29422, add test for raises senario when aggregate column does not exist + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + with pytest.raises(KeyError, match="does not exist"): + df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) + + +def test_agg_relabel_multiindex_raises_duplicate(): + # GH29422, add test for raises senario when getting duplicates + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + with pytest.raises(SpecificationError, match="Function names"): + df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min")) + + def myfunc(s): return np.percentile(s, q=0.90) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 1c297f3e2ada3..f14384928b979 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -211,31 +211,26 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) - expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]]) - tm.assert_frame_equal(result, expected, check_like=True) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) def test_agg_dict_renaming_deprecation(): # 15931 df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.groupby("A").agg( {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) - assert "using a dict with renaming" in str(w[0].message) - assert "named aggregation" in str(w[0].message) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(SpecificationError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(SpecificationError, match=msg): df.groupby("A").B.agg({"foo": "count"}) - assert "using a dict on a Series for aggregation" in str(w[0].message) - assert "named aggregation instead." 
in str(w[0].message) def test_agg_compat(): @@ -251,18 +246,12 @@ def test_agg_compat(): g = df.groupby(["A", "B"]) - expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) - expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"C": ["sum", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) - expected.columns = ["C", "D"] + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": ["sum", "std"]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"C": "sum", "D": "std"}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": "sum", "D": "std"}) def test_agg_nested_dicts(): @@ -278,29 +267,20 @@ def test_agg_nested_dicts(): g = df.groupby(["A", "B"]) - msg = r"cannot perform renaming for r[1-2] with a nested dictionary" + msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) - expected = pd.concat( - [g["C"].mean(), g["C"].std(), g["D"].mean(), g["D"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) # same name as the original column # GH9052 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = g["D"].agg({"result1": np.sum, "result2": np.mean}) - expected = expected.rename(columns={"result1": "D"}) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"result1": np.sum, "result2": np.mean}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"D": np.sum, "result2": np.mean}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"D": np.sum, "result2": np.mean}) def test_agg_item_by_item_raise_typeerror(): @@ -454,6 +434,31 @@ def test_agg_over_numpy_arrays(): tm.assert_frame_equal(result, expected) +def test_agg_tzaware_non_datetime_result(): + # discussed in GH#29589, fixed in GH#29641, operating on tzaware values + # with function that is not dtype-preserving + dti = pd.date_range("2012-01-01", periods=4, tz="UTC") + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti}) + gb = df.groupby("a") + + # Case that _does_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0]) + expected = pd.Series(dti[::2], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + # Cases that do _not_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0].year) + expected = pd.Series([2012, 2012], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) + expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + def 
test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index af98f9efe2af9..5b8cc86513954 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -from pandas.core.groupby.base import reduction_kernels +from pandas.core.groupby.base import reduction_kernels, transformation_kernels import pandas.util.testing as tm @@ -110,3 +110,15 @@ def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. """ return request.param + + +@pytest.fixture(params=transformation_kernels) +def transformation_func(request): + """yields the string names of all groupby transformation functions.""" + return request.param + + +@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels)) +def groupby_func(request): + """yields both aggregation and transformation functions.""" + return request.param diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 0e7a66769d2d4..fcdf599e4ba33 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -25,6 +25,16 @@ def test_series_grouper(): tm.assert_almost_equal(counts, exp_counts) +def test_series_grouper_requires_nonempty_raises(): + # GH#29500 + obj = Series(np.random.randn(10)) + dummy = obj[:0] + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): + libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy) + + def test_series_bin_grouper(): obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -106,15 +116,16 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) - expected = arr.sum(0) - tm.assert_almost_equal(result, expected) - result = libreduction.compute_reduction( - arr, np.sum, axis=1, labels=Index(np.arange(100)) - ) - expected = arr.sum(1) - tm.assert_almost_equal(result, expected) + msg = "Must pass either dummy and labels, or neither" + # we must pass either both labels and dummy, or neither + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction( + arr, np.sum, axis=1, labels=Index(np.arange(100)) + ) dummy = Series(0.0, index=np.arange(100)) result = libreduction.compute_reduction( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 22a23407b2521..5f78e4860f1e9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -781,6 +781,22 @@ def test_categorical_no_compress(): tm.assert_numpy_array_equal(result, exp) +def test_groupby_empty_with_category(): + # GH-9614 + # test fix for when group by on None resulted in + # coercion of dtype categorical -> float + df = pd.DataFrame( + {"A": [None] * 3, "B": pd.Categorical(["train", "train", "test"])} + ) + result = df.groupby("A").first()["B"] + expected = pd.Series( + pd.Categorical([], categories=["test", "train"]), + index=pd.Series([], dtype="object", name="A"), + name="B", + ) + tm.assert_series_equal(result, expected) + + def test_sort(): # 
http://stackoverflow.com/questions/23814368/sorting-pandas- @@ -1111,7 +1127,7 @@ def test_seriesgroupby_observed_true(df_cat, operation, kwargs): index = MultiIndex.from_frame( DataFrame( {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, - **kwargs + **kwargs, ) ) expected = Series(data=[1, 3, 2, 4], index=index, name="C") @@ -1236,3 +1252,82 @@ def test_get_nonexistent_category(): {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) ) + + +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool +): + # GH 17605 + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, + } + ) + args = {"nth": [0]}.get(reduction_func, []) + + expected_length = 4 if observed else 16 + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] + agg = getattr(series_groupby, reduction_func) + result = agg(*args) + + assert len(result) == expected_length + + +@pytest.mark.parametrize( + "func, zero_or_nan", + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), + ], +) +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): + # GH 17605 + # Tests whether the unobserved categories in the result contain 0 or NaN + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + } + ) + unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] + args = {"nth": [0]}.get(func, []) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, func) + result = agg(*args) + + for idx in unobserved: + val = result.loc[idx] + assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + + # If we expect unobserved values to be zero, we also expect the dtype to be int + if zero_or_nan == 0: + assert np.issubdtype(result.dtype, np.integer) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 18c4d7ceddc65..c41f762e9128d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1300,8 +1300,8 @@ def test_size_groupby_all_null(): ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), # Timestamps ( - [x for x in pd.date_range("1/1/18", freq="D", periods=5)], - [x for x in pd.date_range("1/1/18", freq="D", periods=5)][::-1], + list(pd.date_range("1/1/18", freq="D", periods=5)), + list(pd.date_range("1/1/18", freq="D", periods=5))[::-1], ), # All NA ([np.nan] * 5, [np.nan] * 5), diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e17181f55fdba..b848e9caad9be 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +from 
pandas.core.base import SpecificationError import pandas.core.common as com import pandas.util.testing as tm @@ -55,8 +56,9 @@ def test_basic(dtype): # complex agg agged = grouped.aggregate([np.mean, np.std]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - agged = grouped.aggregate({"one": np.mean, "two": np.std}) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate({"one": np.mean, "two": np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) @@ -452,9 +454,9 @@ def test_frame_set_name_single(df): result = grouped["C"].agg([np.mean, np.std]) assert result.index.name == "A" - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped["C"].agg({"foo": np.mean, "bar": np.std}) - assert result.index.name == "A" + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"foo": np.mean, "bar": np.std}) def test_multi_func(df): @@ -602,12 +604,10 @@ def test_groupby_as_index_agg(df): tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) - expected3 = grouped["C"].sum() - expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result3 = grouped["C"].agg({"Q": np.sum}) - tm.assert_frame_equal(result3, expected3) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"Q": np.sum}) # multi-key @@ -1951,3 +1951,39 @@ def test_groupby_only_none_group(): expected = pd.Series([np.nan], name="x") tm.assert_series_equal(actual, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # 21668 + df = pd.DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] +) +def test_dup_labels_output_shape(groupby_func, idx): + if groupby_func in {"size", "ngroup", "cumcount"}: + pytest.skip("Not applicable") + + df = pd.DataFrame([[1, 1]], columns=idx) + grp_by = df.groupby([0]) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(df) + elif groupby_func == "tshift": + df.index = [pd.Timestamp("today")] + args.extend([1, "D"]) + + result = getattr(grp_by, groupby_func)(*args) + + assert result.shape == (1, 2) + tm.assert_index_equal(result.columns, idx) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index db44a4a57230c..c46180c1d11cd 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -911,6 +911,41 @@ def test_pct_change(test_series, freq, periods, fill_method, limit): tm.assert_frame_equal(result, expected.to_frame("vals")) +@pytest.mark.parametrize( + "func, expected_status", + [ + ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]), + ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]), + ], +) +def test_ffill_bfill_non_unique_multilevel(func, expected_status): + # GH 19437 + date = pd.to_datetime( + [ + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-02", + "2018-01-01", + "2018-01-02", + ] + ) + 
symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"] + status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan] + + df = DataFrame({"date": date, "symbol": symbol, "status": status}) + df = df.set_index(["date", "symbol"]) + result = getattr(df.groupby("symbol")["status"], func)() + + index = MultiIndex.from_tuples( + tuples=list(zip(*[date, symbol])), names=["date", "symbol"] + ) + expected = Series(expected_status, index=index, name="status") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 @@ -1073,3 +1108,33 @@ def test_transform_lambda_with_datetimetz(): name="time", ) tm.assert_series_equal(result, expected) + + +def test_transform_fastpath_raises(): + # GH#29631 case where fastpath defined in groupby.generic _choose_path + # raises, but slow_path does not + + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + + def func(grp): + # we want a function such that func(frame) fails but func.apply(frame) + # works + if grp.ndim == 2: + # Ensure that fast_path fails + raise NotImplementedError("Don't cross the streams") + return grp * 2 + + # Check that the fastpath raises, see _transform_general + obj = gb._obj_with_exclusions + gen = gb.grouper.get_iterator(obj, axis=gb.axis) + fast_path, slow_path = gb._define_paths(func) + _, group = next(gen) + + with pytest.raises(NotImplementedError, match="Don't cross the streams"): + fast_path(group) + + result = gb.transform(func) + + expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1ac6370860ba6..c35c4c3568f74 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -31,7 +31,7 @@ class Base: """ base class for index sub-class tests """ - _holder = None # type: Optional[Type[Index]] + _holder: Optional[Type[Index]] = None _compat_props = ["shape", "ndim", "size", "nbytes"] def test_pickle_compat_construction(self): diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 1776538a15fc2..4a38e3a146c0e 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -456,3 +456,15 @@ def test_to_frame_datetime_tz(self): result = idx.to_frame() expected = DataFrame(idx, index=idx) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "name"]) + def test_index_map(self, name): + # see GH20990 + count = 6 + index = pd.date_range("2018-01-01", periods=count, freq="M", name=name).map( + lambda x: (x.year, x.month) + ) + exp_index = pd.MultiIndex.from_product( + ((2018,), range(1, 7)), names=[name, name] + ) + tm.assert_index_equal(index, exp_index) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 2ec267c66091b..2944767ba4c02 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -549,8 +549,6 @@ def test_shift_periods(self): idx = pd.date_range(start=START, end=END, periods=3) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - tm.assert_index_equal(idx.shift(n=0), idx) def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) diff --git 
a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 73eacd8c4856e..f3c8c5cb6efa1 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -105,11 +105,11 @@ def test_with_nans(self, closed): assert index.hasnans is False result = index.isna() - expected = np.repeat(False, len(index)) + expected = np.zeros(len(index), dtype=bool) tm.assert_numpy_array_equal(result, expected) result = index.notna() - expected = np.repeat(True, len(index)) + expected = np.ones(len(index), dtype=bool) tm.assert_numpy_array_equal(result, expected) index = self.create_index_with_nan(closed=closed) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index c32adf275ac98..d2c95b12d5339 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -609,12 +609,11 @@ def test_create_index_existing_name(idx): ("qux", "two"), ], dtype="object", - ), - names=["foo", "bar"], + ) ) tm.assert_index_equal(result, expected) - result = pd.Index(index, names=["A", "B"]) + result = pd.Index(index, name="A") expected = Index( Index( [ @@ -627,7 +626,7 @@ def test_create_index_existing_name(idx): ], dtype="object", ), - names=["A", "B"], + name="A", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index f61ba0132ab97..c81af5a0c6c49 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -146,7 +146,10 @@ def test_identical(idx): assert mi.identical(mi2) mi3 = Index(mi.tolist(), names=mi.names) - mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False) + msg = r"Unexpected keyword arguments {'names'}" + with pytest.raises(TypeError, match=msg): + Index(mi.tolist(), names=mi.names, tupleize_cols=False) + mi4 = Index(mi.tolist(), tupleize_cols=False) assert mi.identical(mi3) assert not mi.identical(mi4) assert mi.equals(mi4) diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 80e4b1fe1e430..f8274a82f1b6f 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -117,5 +117,3 @@ def test_shift_periods(self): idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - tm.assert_index_equal(idx.shift(n=0), idx) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 8c75fbbae7de3..1973cb7f4740d 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -434,7 +434,7 @@ def test_constructor_range_based_deprecated_different_freq(self): with tm.assert_produces_warning(FutureWarning) as m: PeriodIndex(start="2000", periods=2) - warning, = m + (warning,) = m assert 'freq="A-DEC"' in str(warning.message) def test_constructor(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e43d340a46d9f..15844df5d7b04 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -11,7 +11,6 @@ import pandas._config.config as cf from pandas._libs.tslib import Timestamp -from pandas.compat import 
PY36 from pandas.compat.numpy import np_datetime64_compat from pandas.core.dtypes.common import is_unsigned_integer_dtype @@ -34,12 +33,8 @@ period_range, ) from pandas.core.algorithms import safe_sort -from pandas.core.index import ( - _get_combined_index, - ensure_index, - ensure_index_from_sequences, -) -from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.index import ensure_index, ensure_index_from_sequences +from pandas.core.indexes.api import Index, MultiIndex, _get_combined_index from pandas.tests.indexes.common import Base from pandas.tests.indexes.conftest import indices_dict import pandas.util.testing as tm @@ -355,6 +350,11 @@ def test_constructor_simple_new(self, vals, dtype): result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) + def test_constructor_wrong_kwargs(self): + # GH #19348 + with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"): + Index([], foo="bar") + @pytest.mark.parametrize( "vals", [ @@ -730,7 +730,7 @@ def test_nanosecond_index_access(self): assert first_value == x[Timestamp(expected_ts)] def test_booleanindex(self, index): - bool_index = np.repeat(True, len(index)).astype(bool) + bool_index = np.ones(len(index), dtype=bool) bool_index[5:30:2] = False sub_index = index[bool_index] @@ -1385,13 +1385,6 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - # GH18217 - def test_summary_deprecated(self): - ind = Index(["{other}%s", "~:{range}:0"], name="A") - - with tm.assert_produces_warning(FutureWarning): - ind.summary() - def test_format(self, indices): self._check_method_works(Index.format, indices) @@ -1616,11 +1609,7 @@ def test_get_loc(self, method): def test_get_loc_raises_bad_label(self, method): index = pd.Index([0, 1, 2]) if method: - # Messages vary across versions - if PY36: - msg = "not supported between" - else: - msg = "unorderable types" + msg = "not supported between" else: msg = "invalid key" @@ -1837,7 +1826,7 @@ def test_drop_tuple(self, values, to_drop): tm.assert_index_equal(result, expected) removed = index.drop(to_drop[1]) - msg = r"\"\[{}\] not found in axis\"".format(re.escape(to_drop[1].__repr__())) + msg = fr"\"\[{re.escape(to_drop[1].__repr__())}\] not found in axis\"" for drop_me in to_drop[1], [to_drop[1]]: with pytest.raises(KeyError, match=msg): removed.drop(drop_me) @@ -2005,11 +1994,11 @@ def test_isin_level_kwarg_bad_label_raises(self, label, indices): index = indices if isinstance(index, MultiIndex): index = index.rename(["foo", "bar"]) - msg = "'Level {} not found'" + msg = f"'Level {label} not found'" else: index = index.rename("foo") - msg = r"Requested level \({}\) does not match index name \(foo\)" - with pytest.raises(KeyError, match=msg.format(label)): + msg = fr"Requested level \({label}\) does not match index name \(foo\)" + with pytest.raises(KeyError, match=msg): index.isin([], level=label) @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) @@ -2444,21 +2433,13 @@ def create_index(self): def test_argsort(self): index = self.create_index() - if PY36: - with pytest.raises(TypeError, match="'>|<' not supported"): - index.argsort() - else: - with pytest.raises(TypeError, match="unorderable types"): - index.argsort() + with pytest.raises(TypeError, match="'>|<' not supported"): + index.argsort() def test_numpy_argsort(self): index = self.create_index() - if PY36: - with pytest.raises(TypeError, match="'>|<' not supported"): - np.argsort(index) - else: - with pytest.raises(TypeError, 
match="unorderable types"): - np.argsort(index) + with pytest.raises(TypeError, match="'>|<' not supported"): + np.argsort(index) def test_copy_name(self): # Check that "name" argument passed at initialization is honoured @@ -2768,7 +2749,7 @@ def test_generated_op_names(opname, indices): # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. see GH#19723 return - opname = "__{name}__".format(name=opname) + opname = f"__{opname}__" method = getattr(indices, opname) assert method.__name__ == opname diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 61d9d1d70c360..84f98a55376f7 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -188,8 +188,8 @@ def test_disallow_set_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) - msg = "cannot perform {} with this index type: CategoricalIndex" - with pytest.raises(TypeError, match=msg.format(op_name)): + msg = f"cannot perform {op_name} with this index type: CategoricalIndex" + with pytest.raises(TypeError, match=msg): func(idx) def test_method_delegation(self): diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index ae1a21e9b3980..558ba04b657a1 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -167,7 +167,7 @@ def test_dtype_str(self, indices): def test_hash_error(self, indices): index = indices with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + TypeError, match=(f"unhashable type: {type(index).__name__!r}") ): hash(indices) @@ -201,8 +201,9 @@ def test_unique(self, indices): with pytest.raises(IndexError, match=msg): indices.unique(level=3) - msg = r"Requested level \(wrong\) does not match index name \({}\)".format( - re.escape(indices.name.__repr__()) + msg = ( + fr"Requested level \(wrong\) does not match index name " + fr"\({re.escape(indices.name.__repr__())}\)" ) with pytest.raises(KeyError, match=msg): indices.unique(level="wrong") diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index 712feb7b8ef61..c7b219b5ee890 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -112,5 +112,4 @@ def test_searchsorted(self): expected = 2 assert self.container.searchsorted(7) == expected - with tm.assert_produces_warning(FutureWarning): - assert self.container.searchsorted(v=7) == expected + assert self.container.searchsorted(value=7) == expected diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e424b3601a4b2..6ee1ce5c4f2ad 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -167,6 +167,23 @@ def test_constructor(self): result = Index(np.array([np.nan])) assert pd.isna(result.values).all() + @pytest.mark.parametrize( + "index, dtype", + [ + (pd.Int64Index, "float64"), + (pd.UInt64Index, "categorical"), + (pd.Float64Index, "datetime64"), + (pd.RangeIndex, "float64"), + ], + ) + def test_invalid_dtype(self, index, dtype): + # GH 29539 + with pytest.raises( + ValueError, + match=rf"Incorrect `dtype` passed: expected \w+(?: \w+)?, received {dtype}", + ): + index([1, 2, 3], dtype=dtype) + def test_constructor_invalid(self): # invalid @@ -245,9 +262,9 @@ def test_astype(self, mixed_index, float_index): # invalid for dtype in ["M8[ns]", "m8[ns]"]: msg = ( - "Cannot convert Float64Index to dtype {}; 
integer values" - " are required for conversion" - ).format(pandas_dtype(dtype)) + f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " + f"integer values are required for conversion" + ) with pytest.raises(TypeError, match=re.escape(msg)): i.astype(dtype) @@ -588,7 +605,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) name = self._holder.__name__ - msg = "Unable to fill values because {name} cannot contain NA".format(name=name) + msg = f"Unable to fill values because {name} cannot contain NA" # fill_value=True with pytest.raises(ValueError, match=msg): @@ -944,6 +961,11 @@ def test_constructor(self): res = Index(np.array([-1, 2 ** 63], dtype=object)) tm.assert_index_equal(res, idx) + # https://github.com/pandas-dev/pandas/issues/29526 + idx = UInt64Index([1, 2 ** 63 + 1], dtype=np.uint64) + res = Index([1, 2 ** 63 + 1], dtype=np.uint64) + tm.assert_index_equal(res, idx) + def test_get_indexer(self, index_large): target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) indexer = index_large.get_indexer(target) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index fa64e1bacb2e5..b60d3126da1d5 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -110,7 +110,10 @@ def test_constructor_same(self): result = RangeIndex(index) tm.assert_index_equal(result, index, exact=True) - with pytest.raises(TypeError): + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): RangeIndex(index, dtype="float64") def test_constructor_range(self): @@ -140,7 +143,10 @@ def test_constructor_range(self): expected = RangeIndex(1, 5, 2) tm.assert_index_equal(result, expected, exact=True) - with pytest.raises(TypeError): + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): Index(range(1, 5, 2), dtype="float64") msg = r"^from_range\(\) got an unexpected keyword argument" with pytest.raises(TypeError, match=msg): @@ -178,7 +184,10 @@ def test_constructor_corner(self): RangeIndex(1.1, 10.2, 1.3) # invalid passed type - with pytest.raises(TypeError): + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): RangeIndex(1, 5, dtype="float64") @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 861067480b5fa..bbdd6c8c7c017 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -39,7 +39,7 @@ def test_union_bug_1730(self): rng_b = timedelta_range("1 day", periods=4, freq="4H") result = rng_a.union(rng_b) - exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) tm.assert_index_equal(result, exp) def test_union_bug_1745(self): @@ -50,7 +50,7 @@ def test_union_bug_1745(self): ) result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) def test_union_bug_4564(self): @@ -59,7 +59,7 @@ def test_union_bug_4564(self): right = left + pd.offsets.Minute(15) result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) def test_intersection_bug_1708(self): diff --git 
a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 4f95e6bd28989..519a1eb5b16d8 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -108,7 +108,7 @@ def test_series_getitem_indexing_errors( def test_series_getitem_corner_generator( - multiindex_year_month_day_dataframe_random_data + multiindex_year_month_day_dataframe_random_data, ): s = multiindex_year_month_day_dataframe_random_data["A"] result = s[(x > 0 for x in s)] diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 9eeee897bfbb5..76425c72ce4f9 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -1,5 +1,3 @@ -import itertools - import numpy as np import pytest @@ -223,17 +221,13 @@ def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices - index = MultiIndex.from_tuples( - [t for t in itertools.product([6, 7, 8], ["a", "b"])] - ) + index = MultiIndex.from_product([[6, 7, 8], ["a", "b"]]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) - index = MultiIndex.from_tuples( - [t for t in itertools.product([10, 20, 30], ["a", "b"])] - ) + index = MultiIndex.from_product([[10, 20, 30], ["a", "b"]]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index f1f11285696f9..f279b5517c3f6 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -492,6 +492,44 @@ def test_loc_axis_arguments(self): with pytest.raises(ValueError): df.loc(axis="foo")[:, :, ["C1", "C3"]] + def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1":"a2"] + expected = df.iloc[:, :-3] + + tm.assert_frame_equal(result, expected) + + def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1"] + expected = df.iloc[:, :3] + expected.columns = ["b1", "b2", "b3"] + + tm.assert_frame_equal(result, expected) + + def test_loc_ax_single_level_indexer_simple_df(self): + + # GH29519 + # test single level indexing on single index column data frame + df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) + result = df.loc(axis=1)["a"] + expected = pd.Series(np.array([0, 3, 6]), name="a") + tm.assert_series_equal(result, expected) + def test_per_axis_per_level_setitem(self): # test index maker diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index 99f343c2f4a7d..ffbe1bb785cda 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -1,5 +1,3 @@ -from itertools import product - import numpy as np import pytest @@ -159,10 +157,8 @@ def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): def test_xs_integer_key(): # see gh-2107 dates = range(20111201, 20111205) - ids = "abcde" 
- index = MultiIndex.from_tuples( - [x for x in product(dates, ids)], names=["date", "secid"] - ) + ids = list("abcde") + index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) result = df.xs(20111201, level="date") @@ -211,7 +207,7 @@ def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): def test_xs_level_series_slice_not_implemented( - multiindex_year_month_day_dataframe_random_data + multiindex_year_month_day_dataframe_random_data, ): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index aa73bd728595f..81dedfdc74409 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,10 +17,14 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2,] # noqa: E231 + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 - res = df.loc[lambda x: x.A > 2,] # noqa: E231 + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] @@ -90,7 +94,9 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ["A", "C"],] # noqa: E231 + res = df.loc[ + lambda x: ["A", "C"], + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 6c81a00cb8f34..ab3b0ed13b5c0 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -472,7 +472,7 @@ def test_getitem_with_listlike(self): [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats ) dummies = pd.get_dummies(cats) - result = dummies[[c for c in dummies.columns]] + result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) def test_setitem_listlike(self): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 274b72b0561a9..6e26d407ab0ec 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -393,14 +393,3 @@ def test_cache_updating(self): tm.assert_frame_equal(df, expected) expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - - def test_deprecate_is_copy(self): - # GH18801 - df = DataFrame({"A": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # getter - df.is_copy - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # setter - df.is_copy = "test deprecated is_copy" diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 4f38d7beb9c0b..8b29cf3813d13 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -479,22 +479,20 @@ def test_insert_index_period(self, insert, coerced_val, coerced_dtype): obj = pd.PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq="M") assert obj.dtype == "period[M]" + data = [ + pd.Period("2011-01", freq="M"), + coerced_val, + pd.Period("2011-02", freq="M"), + 
pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] if isinstance(insert, pd.Period): - index_type = pd.PeriodIndex + exp = pd.PeriodIndex(data, freq="M") + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) else: - index_type = pd.Index - - exp = index_type( - [ - pd.Period("2011-01", freq="M"), - coerced_val, - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - pd.Period("2011-04", freq="M"), - ], - freq="M", - ) - self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + msg = r"Unexpected keyword arguments {'freq'}" + with pytest.raises(TypeError, match=msg): + pd.Index(data, freq="M") def test_insert_index_complex128(self): pass @@ -929,7 +927,7 @@ class TestReplaceSeriesCoercion(CoercionBase): klasses = ["series"] method = "replace" - rep = {} # type: Dict[str, List] + rep: Dict[str, List] = {} rep["object"] = ["a", "b"] rep["int64"] = [4, 5] rep["float64"] = [1.1, 2.2] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d6d3763981131..fc5753ec2955c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from pandas.compat import PY36 from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype @@ -222,7 +221,7 @@ def test_setitem_dtype_upcast(self): expected = DataFrame( [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] ) - tm.assert_frame_equal(df, expected, check_like=not PY36) + tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ee7fca6ec7672..abe2ddf955ad8 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1,10 +1,8 @@ from collections import OrderedDict from datetime import date, datetime -from distutils.version import LooseVersion import itertools import operator import re -import sys import numpy as np import pytest @@ -26,9 +24,6 @@ from pandas.core.internals import BlockManager, SingleBlockManager, make_block import pandas.util.testing as tm -# in 3.6.1 a c-api slicing function changed, see src/compat_helper.h -PY361 = LooseVersion(sys.version) >= LooseVersion("3.6.1") - @pytest.fixture def mgr(): @@ -139,7 +134,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): arr = values.sp_values.view() arr += num_offset - 1 else: - raise ValueError('Unsupported typestr: "%s"' % typestr) + raise ValueError(f'Unsupported typestr: "{typestr}"') return make_block(values, placement=placement, ndim=len(shape)) @@ -313,12 +308,6 @@ def test_delete(self): with pytest.raises(Exception): newb.delete(3) - def test_make_block_same_class(self): - # issue 19431 - block = create_block("M8[ns, US/Eastern]", [3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - block.make_block_same_class(block.values, dtype=block.values.dtype) - class TestDatetimeBlock: def test_can_hold_element(self): @@ -1096,10 +1085,6 @@ def assert_as_slice_equals(arr, slc): assert_as_slice_equals([2, 1], slice(2, 0, -1)) - if not PY361: - assert_as_slice_equals([2, 1, 0], slice(2, None, -1)) - assert_as_slice_equals([100, 0], slice(100, None, -100)) - def test_not_slice_like_arrays(self): def assert_not_slice_like(arr): assert not BlockPlacement(arr).is_slice_like @@ -1119,10 +1104,6 @@ def test_slice_iter(self): assert list(BlockPlacement(slice(0, 0))) 
== [] assert list(BlockPlacement(slice(3, 0))) == [] - if not PY361: - assert list(BlockPlacement(slice(3, 0, -1))) == [3, 2, 1] - assert list(BlockPlacement(slice(3, None, -1))) == [3, 2, 1, 0] - def test_slice_to_array_conversion(self): def assert_as_array_equals(slc, asarray): tm.assert_numpy_array_equal( @@ -1135,10 +1116,6 @@ def assert_as_array_equals(slc, asarray): assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) - if not PY361: - assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0]) - assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1]) - def test_blockplacement_add(self): bpl = BlockPlacement(slice(0, 5)) assert bpl.add(1).as_slice == slice(1, 6, 1) @@ -1168,14 +1145,6 @@ def assert_add_equals(val, inc, result): with pytest.raises(ValueError): BlockPlacement([1, 2, 4]).add(-10) - if not PY361: - assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0]) - assert_add_equals(slice(2, None, -1), 0, [2, 1, 0]) - assert_add_equals(slice(2, None, -1), 10, [12, 11, 10]) - - with pytest.raises(ValueError): - BlockPlacement(slice(2, None, -1)).add(-1) - class DummyElement: def __init__(self, value, dtype): @@ -1280,13 +1249,6 @@ def test_holder(typestr, holder): assert blk._holder is holder -def test_deprecated_fastpath(): - # GH#19265 - values = np.random.rand(3, 3) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - make_block(values, placement=np.arange(3), fastpath=True) - - def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = slice(2) @@ -1322,3 +1284,10 @@ def test_make_block_no_pandas_array(): result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype) assert result.is_integer is True assert result.is_extension is False + + +def test_dataframe_not_equal(): + # see GH28839 + df1 = pd.DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = pd.DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 7b6b9b6380a36..3f034107ef24f 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -40,7 +40,7 @@ def s3_resource(tips_file, jsonl_file): A private bucket "cant_get_it" is also created. The boto3 s3 resource is yielded by the fixture. 
""" - pytest.importorskip("s3fs") + s3fs = pytest.importorskip("s3fs") boto3 = pytest.importorskip("boto3") with tm.ensure_safe_environment_variables(): @@ -77,6 +77,7 @@ def add_tips_files(bucket_name): conn.create_bucket(Bucket="cant_get_it", ACL="private") add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() yield conn finally: s3.stop() diff --git a/pandas/tests/io/data/banklist.csv b/pandas/tests/io/data/csv/banklist.csv similarity index 100% rename from pandas/tests/io/data/banklist.csv rename to pandas/tests/io/data/csv/banklist.csv diff --git a/pandas/tests/io/data/iris.csv b/pandas/tests/io/data/csv/iris.csv similarity index 100% rename from pandas/tests/io/data/iris.csv rename to pandas/tests/io/data/csv/iris.csv diff --git a/pandas/tests/io/data/test1.csv b/pandas/tests/io/data/csv/test1.csv similarity index 100% rename from pandas/tests/io/data/test1.csv rename to pandas/tests/io/data/csv/test1.csv diff --git a/pandas/tests/io/data/test_mmap.csv b/pandas/tests/io/data/csv/test_mmap.csv similarity index 100% rename from pandas/tests/io/data/test_mmap.csv rename to pandas/tests/io/data/csv/test_mmap.csv diff --git a/pandas/tests/io/data/tips.csv b/pandas/tests/io/data/csv/tips.csv similarity index 100% rename from pandas/tests/io/data/tips.csv rename to pandas/tests/io/data/csv/tips.csv diff --git a/pandas/tests/io/data/blank.ods b/pandas/tests/io/data/excel/blank.ods similarity index 100% rename from pandas/tests/io/data/blank.ods rename to pandas/tests/io/data/excel/blank.ods diff --git a/pandas/tests/io/data/blank.xls b/pandas/tests/io/data/excel/blank.xls similarity index 100% rename from pandas/tests/io/data/blank.xls rename to pandas/tests/io/data/excel/blank.xls diff --git a/pandas/tests/io/data/blank.xlsm b/pandas/tests/io/data/excel/blank.xlsm similarity index 100% rename from pandas/tests/io/data/blank.xlsm rename to pandas/tests/io/data/excel/blank.xlsm diff --git a/pandas/tests/io/data/blank.xlsx b/pandas/tests/io/data/excel/blank.xlsx similarity index 100% rename from pandas/tests/io/data/blank.xlsx rename to pandas/tests/io/data/excel/blank.xlsx diff --git a/pandas/tests/io/data/blank_with_header.ods b/pandas/tests/io/data/excel/blank_with_header.ods similarity index 100% rename from pandas/tests/io/data/blank_with_header.ods rename to pandas/tests/io/data/excel/blank_with_header.ods diff --git a/pandas/tests/io/data/blank_with_header.xls b/pandas/tests/io/data/excel/blank_with_header.xls similarity index 100% rename from pandas/tests/io/data/blank_with_header.xls rename to pandas/tests/io/data/excel/blank_with_header.xls diff --git a/pandas/tests/io/data/blank_with_header.xlsm b/pandas/tests/io/data/excel/blank_with_header.xlsm similarity index 100% rename from pandas/tests/io/data/blank_with_header.xlsm rename to pandas/tests/io/data/excel/blank_with_header.xlsm diff --git a/pandas/tests/io/data/blank_with_header.xlsx b/pandas/tests/io/data/excel/blank_with_header.xlsx similarity index 100% rename from pandas/tests/io/data/blank_with_header.xlsx rename to pandas/tests/io/data/excel/blank_with_header.xlsx diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/excel/invalid_value_type.ods similarity index 100% rename from pandas/tests/io/data/invalid_value_type.ods rename to pandas/tests/io/data/excel/invalid_value_type.ods diff --git a/pandas/tests/io/data/test1.ods b/pandas/tests/io/data/excel/test1.ods similarity index 100% rename from pandas/tests/io/data/test1.ods rename to pandas/tests/io/data/excel/test1.ods diff 
--git a/pandas/tests/io/data/test1.xls b/pandas/tests/io/data/excel/test1.xls similarity index 100% rename from pandas/tests/io/data/test1.xls rename to pandas/tests/io/data/excel/test1.xls diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/excel/test1.xlsm similarity index 100% rename from pandas/tests/io/data/test1.xlsm rename to pandas/tests/io/data/excel/test1.xlsm diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/excel/test1.xlsx similarity index 100% rename from pandas/tests/io/data/test1.xlsx rename to pandas/tests/io/data/excel/test1.xlsx diff --git a/pandas/tests/io/data/test2.ods b/pandas/tests/io/data/excel/test2.ods similarity index 100% rename from pandas/tests/io/data/test2.ods rename to pandas/tests/io/data/excel/test2.ods diff --git a/pandas/tests/io/data/test2.xls b/pandas/tests/io/data/excel/test2.xls similarity index 100% rename from pandas/tests/io/data/test2.xls rename to pandas/tests/io/data/excel/test2.xls diff --git a/pandas/tests/io/data/test2.xlsm b/pandas/tests/io/data/excel/test2.xlsm similarity index 100% rename from pandas/tests/io/data/test2.xlsm rename to pandas/tests/io/data/excel/test2.xlsm diff --git a/pandas/tests/io/data/test2.xlsx b/pandas/tests/io/data/excel/test2.xlsx similarity index 100% rename from pandas/tests/io/data/test2.xlsx rename to pandas/tests/io/data/excel/test2.xlsx diff --git a/pandas/tests/io/data/test3.ods b/pandas/tests/io/data/excel/test3.ods similarity index 100% rename from pandas/tests/io/data/test3.ods rename to pandas/tests/io/data/excel/test3.ods diff --git a/pandas/tests/io/data/test3.xls b/pandas/tests/io/data/excel/test3.xls similarity index 100% rename from pandas/tests/io/data/test3.xls rename to pandas/tests/io/data/excel/test3.xls diff --git a/pandas/tests/io/data/test3.xlsm b/pandas/tests/io/data/excel/test3.xlsm similarity index 100% rename from pandas/tests/io/data/test3.xlsm rename to pandas/tests/io/data/excel/test3.xlsm diff --git a/pandas/tests/io/data/test3.xlsx b/pandas/tests/io/data/excel/test3.xlsx similarity index 100% rename from pandas/tests/io/data/test3.xlsx rename to pandas/tests/io/data/excel/test3.xlsx diff --git a/pandas/tests/io/data/test4.ods b/pandas/tests/io/data/excel/test4.ods similarity index 100% rename from pandas/tests/io/data/test4.ods rename to pandas/tests/io/data/excel/test4.ods diff --git a/pandas/tests/io/data/test4.xls b/pandas/tests/io/data/excel/test4.xls similarity index 100% rename from pandas/tests/io/data/test4.xls rename to pandas/tests/io/data/excel/test4.xls diff --git a/pandas/tests/io/data/test4.xlsm b/pandas/tests/io/data/excel/test4.xlsm similarity index 100% rename from pandas/tests/io/data/test4.xlsm rename to pandas/tests/io/data/excel/test4.xlsm diff --git a/pandas/tests/io/data/test4.xlsx b/pandas/tests/io/data/excel/test4.xlsx similarity index 100% rename from pandas/tests/io/data/test4.xlsx rename to pandas/tests/io/data/excel/test4.xlsx diff --git a/pandas/tests/io/data/test5.ods b/pandas/tests/io/data/excel/test5.ods similarity index 100% rename from pandas/tests/io/data/test5.ods rename to pandas/tests/io/data/excel/test5.ods diff --git a/pandas/tests/io/data/test5.xls b/pandas/tests/io/data/excel/test5.xls similarity index 100% rename from pandas/tests/io/data/test5.xls rename to pandas/tests/io/data/excel/test5.xls diff --git a/pandas/tests/io/data/test5.xlsm b/pandas/tests/io/data/excel/test5.xlsm similarity index 100% rename from pandas/tests/io/data/test5.xlsm rename to pandas/tests/io/data/excel/test5.xlsm diff --git 
a/pandas/tests/io/data/test5.xlsx b/pandas/tests/io/data/excel/test5.xlsx similarity index 100% rename from pandas/tests/io/data/test5.xlsx rename to pandas/tests/io/data/excel/test5.xlsx diff --git a/pandas/tests/io/data/test_converters.ods b/pandas/tests/io/data/excel/test_converters.ods similarity index 100% rename from pandas/tests/io/data/test_converters.ods rename to pandas/tests/io/data/excel/test_converters.ods diff --git a/pandas/tests/io/data/test_converters.xls b/pandas/tests/io/data/excel/test_converters.xls similarity index 100% rename from pandas/tests/io/data/test_converters.xls rename to pandas/tests/io/data/excel/test_converters.xls diff --git a/pandas/tests/io/data/test_converters.xlsm b/pandas/tests/io/data/excel/test_converters.xlsm similarity index 100% rename from pandas/tests/io/data/test_converters.xlsm rename to pandas/tests/io/data/excel/test_converters.xlsm diff --git a/pandas/tests/io/data/test_converters.xlsx b/pandas/tests/io/data/excel/test_converters.xlsx similarity index 100% rename from pandas/tests/io/data/test_converters.xlsx rename to pandas/tests/io/data/excel/test_converters.xlsx diff --git a/pandas/tests/io/data/test_index_name_pre17.ods b/pandas/tests/io/data/excel/test_index_name_pre17.ods similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.ods rename to pandas/tests/io/data/excel/test_index_name_pre17.ods diff --git a/pandas/tests/io/data/test_index_name_pre17.xls b/pandas/tests/io/data/excel/test_index_name_pre17.xls similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.xls rename to pandas/tests/io/data/excel/test_index_name_pre17.xls diff --git a/pandas/tests/io/data/test_index_name_pre17.xlsm b/pandas/tests/io/data/excel/test_index_name_pre17.xlsm similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.xlsm rename to pandas/tests/io/data/excel/test_index_name_pre17.xlsm diff --git a/pandas/tests/io/data/test_index_name_pre17.xlsx b/pandas/tests/io/data/excel/test_index_name_pre17.xlsx similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.xlsx rename to pandas/tests/io/data/excel/test_index_name_pre17.xlsx diff --git a/pandas/tests/io/data/test_multisheet.ods b/pandas/tests/io/data/excel/test_multisheet.ods similarity index 100% rename from pandas/tests/io/data/test_multisheet.ods rename to pandas/tests/io/data/excel/test_multisheet.ods diff --git a/pandas/tests/io/data/test_multisheet.xls b/pandas/tests/io/data/excel/test_multisheet.xls similarity index 100% rename from pandas/tests/io/data/test_multisheet.xls rename to pandas/tests/io/data/excel/test_multisheet.xls diff --git a/pandas/tests/io/data/test_multisheet.xlsm b/pandas/tests/io/data/excel/test_multisheet.xlsm similarity index 100% rename from pandas/tests/io/data/test_multisheet.xlsm rename to pandas/tests/io/data/excel/test_multisheet.xlsm diff --git a/pandas/tests/io/data/test_multisheet.xlsx b/pandas/tests/io/data/excel/test_multisheet.xlsx similarity index 100% rename from pandas/tests/io/data/test_multisheet.xlsx rename to pandas/tests/io/data/excel/test_multisheet.xlsx diff --git a/pandas/tests/io/data/test_squeeze.ods b/pandas/tests/io/data/excel/test_squeeze.ods similarity index 100% rename from pandas/tests/io/data/test_squeeze.ods rename to pandas/tests/io/data/excel/test_squeeze.ods diff --git a/pandas/tests/io/data/test_squeeze.xls b/pandas/tests/io/data/excel/test_squeeze.xls similarity index 100% rename from pandas/tests/io/data/test_squeeze.xls rename to 
pandas/tests/io/data/excel/test_squeeze.xls diff --git a/pandas/tests/io/data/test_squeeze.xlsm b/pandas/tests/io/data/excel/test_squeeze.xlsm similarity index 100% rename from pandas/tests/io/data/test_squeeze.xlsm rename to pandas/tests/io/data/excel/test_squeeze.xlsm diff --git a/pandas/tests/io/data/test_squeeze.xlsx b/pandas/tests/io/data/excel/test_squeeze.xlsx similarity index 100% rename from pandas/tests/io/data/test_squeeze.xlsx rename to pandas/tests/io/data/excel/test_squeeze.xlsx diff --git a/pandas/tests/io/data/test_types.ods b/pandas/tests/io/data/excel/test_types.ods similarity index 100% rename from pandas/tests/io/data/test_types.ods rename to pandas/tests/io/data/excel/test_types.ods diff --git a/pandas/tests/io/data/test_types.xls b/pandas/tests/io/data/excel/test_types.xls similarity index 100% rename from pandas/tests/io/data/test_types.xls rename to pandas/tests/io/data/excel/test_types.xls diff --git a/pandas/tests/io/data/test_types.xlsm b/pandas/tests/io/data/excel/test_types.xlsm similarity index 100% rename from pandas/tests/io/data/test_types.xlsm rename to pandas/tests/io/data/excel/test_types.xlsm diff --git a/pandas/tests/io/data/test_types.xlsx b/pandas/tests/io/data/excel/test_types.xlsx similarity index 100% rename from pandas/tests/io/data/test_types.xlsx rename to pandas/tests/io/data/excel/test_types.xlsx diff --git a/pandas/tests/io/data/testdateoverflow.ods b/pandas/tests/io/data/excel/testdateoverflow.ods similarity index 100% rename from pandas/tests/io/data/testdateoverflow.ods rename to pandas/tests/io/data/excel/testdateoverflow.ods diff --git a/pandas/tests/io/data/testdateoverflow.xls b/pandas/tests/io/data/excel/testdateoverflow.xls similarity index 100% rename from pandas/tests/io/data/testdateoverflow.xls rename to pandas/tests/io/data/excel/testdateoverflow.xls diff --git a/pandas/tests/io/data/testdateoverflow.xlsm b/pandas/tests/io/data/excel/testdateoverflow.xlsm similarity index 100% rename from pandas/tests/io/data/testdateoverflow.xlsm rename to pandas/tests/io/data/excel/testdateoverflow.xlsm diff --git a/pandas/tests/io/data/testdateoverflow.xlsx b/pandas/tests/io/data/excel/testdateoverflow.xlsx similarity index 100% rename from pandas/tests/io/data/testdateoverflow.xlsx rename to pandas/tests/io/data/excel/testdateoverflow.xlsx diff --git a/pandas/tests/io/data/testdtype.ods b/pandas/tests/io/data/excel/testdtype.ods similarity index 100% rename from pandas/tests/io/data/testdtype.ods rename to pandas/tests/io/data/excel/testdtype.ods diff --git a/pandas/tests/io/data/testdtype.xls b/pandas/tests/io/data/excel/testdtype.xls similarity index 100% rename from pandas/tests/io/data/testdtype.xls rename to pandas/tests/io/data/excel/testdtype.xls diff --git a/pandas/tests/io/data/testdtype.xlsm b/pandas/tests/io/data/excel/testdtype.xlsm similarity index 100% rename from pandas/tests/io/data/testdtype.xlsm rename to pandas/tests/io/data/excel/testdtype.xlsm diff --git a/pandas/tests/io/data/testdtype.xlsx b/pandas/tests/io/data/excel/testdtype.xlsx similarity index 100% rename from pandas/tests/io/data/testdtype.xlsx rename to pandas/tests/io/data/excel/testdtype.xlsx diff --git a/pandas/tests/io/data/testmultiindex.ods b/pandas/tests/io/data/excel/testmultiindex.ods similarity index 100% rename from pandas/tests/io/data/testmultiindex.ods rename to pandas/tests/io/data/excel/testmultiindex.ods diff --git a/pandas/tests/io/data/testmultiindex.xls b/pandas/tests/io/data/excel/testmultiindex.xls similarity index 100% rename from 
pandas/tests/io/data/testmultiindex.xls rename to pandas/tests/io/data/excel/testmultiindex.xls diff --git a/pandas/tests/io/data/testmultiindex.xlsm b/pandas/tests/io/data/excel/testmultiindex.xlsm similarity index 100% rename from pandas/tests/io/data/testmultiindex.xlsm rename to pandas/tests/io/data/excel/testmultiindex.xlsm diff --git a/pandas/tests/io/data/testmultiindex.xlsx b/pandas/tests/io/data/excel/testmultiindex.xlsx similarity index 100% rename from pandas/tests/io/data/testmultiindex.xlsx rename to pandas/tests/io/data/excel/testmultiindex.xlsx diff --git a/pandas/tests/io/data/testskiprows.ods b/pandas/tests/io/data/excel/testskiprows.ods similarity index 100% rename from pandas/tests/io/data/testskiprows.ods rename to pandas/tests/io/data/excel/testskiprows.ods diff --git a/pandas/tests/io/data/testskiprows.xls b/pandas/tests/io/data/excel/testskiprows.xls similarity index 100% rename from pandas/tests/io/data/testskiprows.xls rename to pandas/tests/io/data/excel/testskiprows.xls diff --git a/pandas/tests/io/data/testskiprows.xlsm b/pandas/tests/io/data/excel/testskiprows.xlsm similarity index 100% rename from pandas/tests/io/data/testskiprows.xlsm rename to pandas/tests/io/data/excel/testskiprows.xlsm diff --git a/pandas/tests/io/data/testskiprows.xlsx b/pandas/tests/io/data/excel/testskiprows.xlsx similarity index 100% rename from pandas/tests/io/data/testskiprows.xlsx rename to pandas/tests/io/data/excel/testskiprows.xlsx diff --git a/pandas/tests/io/data/times_1900.ods b/pandas/tests/io/data/excel/times_1900.ods similarity index 100% rename from pandas/tests/io/data/times_1900.ods rename to pandas/tests/io/data/excel/times_1900.ods diff --git a/pandas/tests/io/data/times_1900.xls b/pandas/tests/io/data/excel/times_1900.xls similarity index 100% rename from pandas/tests/io/data/times_1900.xls rename to pandas/tests/io/data/excel/times_1900.xls diff --git a/pandas/tests/io/data/times_1900.xlsm b/pandas/tests/io/data/excel/times_1900.xlsm similarity index 100% rename from pandas/tests/io/data/times_1900.xlsm rename to pandas/tests/io/data/excel/times_1900.xlsm diff --git a/pandas/tests/io/data/times_1900.xlsx b/pandas/tests/io/data/excel/times_1900.xlsx similarity index 100% rename from pandas/tests/io/data/times_1900.xlsx rename to pandas/tests/io/data/excel/times_1900.xlsx diff --git a/pandas/tests/io/data/times_1904.ods b/pandas/tests/io/data/excel/times_1904.ods similarity index 100% rename from pandas/tests/io/data/times_1904.ods rename to pandas/tests/io/data/excel/times_1904.ods diff --git a/pandas/tests/io/data/times_1904.xls b/pandas/tests/io/data/excel/times_1904.xls similarity index 100% rename from pandas/tests/io/data/times_1904.xls rename to pandas/tests/io/data/excel/times_1904.xls diff --git a/pandas/tests/io/data/times_1904.xlsm b/pandas/tests/io/data/excel/times_1904.xlsm similarity index 100% rename from pandas/tests/io/data/times_1904.xlsm rename to pandas/tests/io/data/excel/times_1904.xlsm diff --git a/pandas/tests/io/data/times_1904.xlsx b/pandas/tests/io/data/excel/times_1904.xlsx similarity index 100% rename from pandas/tests/io/data/times_1904.xlsx rename to pandas/tests/io/data/excel/times_1904.xlsx diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/excel/writertable.odt similarity index 100% rename from pandas/tests/io/data/writertable.odt rename to pandas/tests/io/data/excel/writertable.odt diff --git a/pandas/tests/io/data/feather-0_3_1.feather b/pandas/tests/io/data/feather/feather-0_3_1.feather similarity index 100% 
rename from pandas/tests/io/data/feather-0_3_1.feather rename to pandas/tests/io/data/feather/feather-0_3_1.feather diff --git a/pandas/tests/io/data/fixed_width_format.txt b/pandas/tests/io/data/fixed_width/fixed_width_format.txt similarity index 100% rename from pandas/tests/io/data/fixed_width_format.txt rename to pandas/tests/io/data/fixed_width/fixed_width_format.txt diff --git a/pandas/tests/io/data/banklist.html b/pandas/tests/io/data/html/banklist.html similarity index 100% rename from pandas/tests/io/data/banklist.html rename to pandas/tests/io/data/html/banklist.html diff --git a/pandas/tests/io/data/computer_sales_page.html b/pandas/tests/io/data/html/computer_sales_page.html similarity index 100% rename from pandas/tests/io/data/computer_sales_page.html rename to pandas/tests/io/data/html/computer_sales_page.html diff --git a/pandas/tests/io/data/macau.html b/pandas/tests/io/data/html/macau.html similarity index 100% rename from pandas/tests/io/data/macau.html rename to pandas/tests/io/data/html/macau.html diff --git a/pandas/tests/io/data/nyse_wsj.html b/pandas/tests/io/data/html/nyse_wsj.html similarity index 100% rename from pandas/tests/io/data/nyse_wsj.html rename to pandas/tests/io/data/html/nyse_wsj.html diff --git a/pandas/tests/io/data/spam.html b/pandas/tests/io/data/html/spam.html similarity index 100% rename from pandas/tests/io/data/spam.html rename to pandas/tests/io/data/html/spam.html diff --git a/pandas/tests/io/data/valid_markup.html b/pandas/tests/io/data/html/valid_markup.html similarity index 100% rename from pandas/tests/io/data/valid_markup.html rename to pandas/tests/io/data/html/valid_markup.html diff --git a/pandas/tests/io/data/wikipedia_states.html b/pandas/tests/io/data/html/wikipedia_states.html similarity index 100% rename from pandas/tests/io/data/wikipedia_states.html rename to pandas/tests/io/data/html/wikipedia_states.html diff --git a/pandas/tests/io/data/categorical.0.25.0.pickle b/pandas/tests/io/data/pickle/categorical.0.25.0.pickle similarity index 100% rename from pandas/tests/io/data/categorical.0.25.0.pickle rename to pandas/tests/io/data/pickle/categorical.0.25.0.pickle diff --git a/pandas/tests/io/data/sparseframe-0.20.3.pickle.gz b/pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz similarity index 100% rename from pandas/tests/io/data/sparseframe-0.20.3.pickle.gz rename to pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz diff --git a/pandas/tests/io/data/sparseseries-0.20.3.pickle.gz b/pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz similarity index 100% rename from pandas/tests/io/data/sparseseries-0.20.3.pickle.gz rename to pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz diff --git a/pandas/tests/io/data/test_py27.pkl b/pandas/tests/io/data/pickle/test_py27.pkl similarity index 100% rename from pandas/tests/io/data/test_py27.pkl rename to pandas/tests/io/data/pickle/test_py27.pkl diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/spss/labelled-num-na.sav similarity index 100% rename from pandas/tests/io/data/labelled-num-na.sav rename to pandas/tests/io/data/spss/labelled-num-na.sav diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/spss/labelled-num.sav similarity index 100% rename from pandas/tests/io/data/labelled-num.sav rename to pandas/tests/io/data/spss/labelled-num.sav diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/spss/labelled-str.sav similarity index 100% rename from pandas/tests/io/data/labelled-str.sav rename 
to pandas/tests/io/data/spss/labelled-str.sav diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/spss/umlauts.sav similarity index 100% rename from pandas/tests/io/data/umlauts.sav rename to pandas/tests/io/data/spss/umlauts.sav diff --git a/pandas/tests/io/data/S4_EDUC1.dta b/pandas/tests/io/data/stata/S4_EDUC1.dta similarity index 100% rename from pandas/tests/io/data/S4_EDUC1.dta rename to pandas/tests/io/data/stata/S4_EDUC1.dta diff --git a/pandas/tests/io/data/stata10_115.dta b/pandas/tests/io/data/stata/stata10_115.dta similarity index 100% rename from pandas/tests/io/data/stata10_115.dta rename to pandas/tests/io/data/stata/stata10_115.dta diff --git a/pandas/tests/io/data/stata10_117.dta b/pandas/tests/io/data/stata/stata10_117.dta similarity index 100% rename from pandas/tests/io/data/stata10_117.dta rename to pandas/tests/io/data/stata/stata10_117.dta diff --git a/pandas/tests/io/data/stata11_115.dta b/pandas/tests/io/data/stata/stata11_115.dta similarity index 100% rename from pandas/tests/io/data/stata11_115.dta rename to pandas/tests/io/data/stata/stata11_115.dta diff --git a/pandas/tests/io/data/stata11_117.dta b/pandas/tests/io/data/stata/stata11_117.dta similarity index 100% rename from pandas/tests/io/data/stata11_117.dta rename to pandas/tests/io/data/stata/stata11_117.dta diff --git a/pandas/tests/io/data/stata12_117.dta b/pandas/tests/io/data/stata/stata12_117.dta similarity index 100% rename from pandas/tests/io/data/stata12_117.dta rename to pandas/tests/io/data/stata/stata12_117.dta diff --git a/pandas/tests/io/data/stata13_dates.dta b/pandas/tests/io/data/stata/stata13_dates.dta similarity index 100% rename from pandas/tests/io/data/stata13_dates.dta rename to pandas/tests/io/data/stata/stata13_dates.dta diff --git a/pandas/tests/io/data/stata14_118.dta b/pandas/tests/io/data/stata/stata14_118.dta similarity index 100% rename from pandas/tests/io/data/stata14_118.dta rename to pandas/tests/io/data/stata/stata14_118.dta diff --git a/pandas/tests/io/data/stata15.dta b/pandas/tests/io/data/stata/stata15.dta similarity index 100% rename from pandas/tests/io/data/stata15.dta rename to pandas/tests/io/data/stata/stata15.dta diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata/stata16_118.dta similarity index 100% rename from pandas/tests/io/data/stata16_118.dta rename to pandas/tests/io/data/stata/stata16_118.dta diff --git a/pandas/tests/io/data/stata1_114.dta b/pandas/tests/io/data/stata/stata1_114.dta similarity index 100% rename from pandas/tests/io/data/stata1_114.dta rename to pandas/tests/io/data/stata/stata1_114.dta diff --git a/pandas/tests/io/data/stata1_117.dta b/pandas/tests/io/data/stata/stata1_117.dta similarity index 100% rename from pandas/tests/io/data/stata1_117.dta rename to pandas/tests/io/data/stata/stata1_117.dta diff --git a/pandas/tests/io/data/stata1_119.dta.gz b/pandas/tests/io/data/stata/stata1_119.dta.gz similarity index 100% rename from pandas/tests/io/data/stata1_119.dta.gz rename to pandas/tests/io/data/stata/stata1_119.dta.gz diff --git a/pandas/tests/io/data/stata1_encoding.dta b/pandas/tests/io/data/stata/stata1_encoding.dta similarity index 100% rename from pandas/tests/io/data/stata1_encoding.dta rename to pandas/tests/io/data/stata/stata1_encoding.dta diff --git a/pandas/tests/io/data/stata1_encoding_118.dta b/pandas/tests/io/data/stata/stata1_encoding_118.dta similarity index 100% rename from pandas/tests/io/data/stata1_encoding_118.dta rename to 
pandas/tests/io/data/stata/stata1_encoding_118.dta diff --git a/pandas/tests/io/data/stata2_113.dta b/pandas/tests/io/data/stata/stata2_113.dta similarity index 100% rename from pandas/tests/io/data/stata2_113.dta rename to pandas/tests/io/data/stata/stata2_113.dta diff --git a/pandas/tests/io/data/stata2_114.dta b/pandas/tests/io/data/stata/stata2_114.dta similarity index 100% rename from pandas/tests/io/data/stata2_114.dta rename to pandas/tests/io/data/stata/stata2_114.dta diff --git a/pandas/tests/io/data/stata2_115.dta b/pandas/tests/io/data/stata/stata2_115.dta similarity index 100% rename from pandas/tests/io/data/stata2_115.dta rename to pandas/tests/io/data/stata/stata2_115.dta diff --git a/pandas/tests/io/data/stata2_117.dta b/pandas/tests/io/data/stata/stata2_117.dta similarity index 100% rename from pandas/tests/io/data/stata2_117.dta rename to pandas/tests/io/data/stata/stata2_117.dta diff --git a/pandas/tests/io/data/stata3.csv b/pandas/tests/io/data/stata/stata3.csv similarity index 100% rename from pandas/tests/io/data/stata3.csv rename to pandas/tests/io/data/stata/stata3.csv diff --git a/pandas/tests/io/data/stata3_113.dta b/pandas/tests/io/data/stata/stata3_113.dta similarity index 100% rename from pandas/tests/io/data/stata3_113.dta rename to pandas/tests/io/data/stata/stata3_113.dta diff --git a/pandas/tests/io/data/stata3_114.dta b/pandas/tests/io/data/stata/stata3_114.dta similarity index 100% rename from pandas/tests/io/data/stata3_114.dta rename to pandas/tests/io/data/stata/stata3_114.dta diff --git a/pandas/tests/io/data/stata3_115.dta b/pandas/tests/io/data/stata/stata3_115.dta similarity index 100% rename from pandas/tests/io/data/stata3_115.dta rename to pandas/tests/io/data/stata/stata3_115.dta diff --git a/pandas/tests/io/data/stata3_117.dta b/pandas/tests/io/data/stata/stata3_117.dta similarity index 100% rename from pandas/tests/io/data/stata3_117.dta rename to pandas/tests/io/data/stata/stata3_117.dta diff --git a/pandas/tests/io/data/stata4_113.dta b/pandas/tests/io/data/stata/stata4_113.dta similarity index 100% rename from pandas/tests/io/data/stata4_113.dta rename to pandas/tests/io/data/stata/stata4_113.dta diff --git a/pandas/tests/io/data/stata4_114.dta b/pandas/tests/io/data/stata/stata4_114.dta similarity index 100% rename from pandas/tests/io/data/stata4_114.dta rename to pandas/tests/io/data/stata/stata4_114.dta diff --git a/pandas/tests/io/data/stata4_115.dta b/pandas/tests/io/data/stata/stata4_115.dta similarity index 100% rename from pandas/tests/io/data/stata4_115.dta rename to pandas/tests/io/data/stata/stata4_115.dta diff --git a/pandas/tests/io/data/stata4_117.dta b/pandas/tests/io/data/stata/stata4_117.dta similarity index 100% rename from pandas/tests/io/data/stata4_117.dta rename to pandas/tests/io/data/stata/stata4_117.dta diff --git a/pandas/tests/io/data/stata5.csv b/pandas/tests/io/data/stata/stata5.csv similarity index 100% rename from pandas/tests/io/data/stata5.csv rename to pandas/tests/io/data/stata/stata5.csv diff --git a/pandas/tests/io/data/stata5_113.dta b/pandas/tests/io/data/stata/stata5_113.dta similarity index 100% rename from pandas/tests/io/data/stata5_113.dta rename to pandas/tests/io/data/stata/stata5_113.dta diff --git a/pandas/tests/io/data/stata5_114.dta b/pandas/tests/io/data/stata/stata5_114.dta similarity index 100% rename from pandas/tests/io/data/stata5_114.dta rename to pandas/tests/io/data/stata/stata5_114.dta diff --git a/pandas/tests/io/data/stata5_115.dta b/pandas/tests/io/data/stata/stata5_115.dta 
similarity index 100% rename from pandas/tests/io/data/stata5_115.dta rename to pandas/tests/io/data/stata/stata5_115.dta diff --git a/pandas/tests/io/data/stata5_117.dta b/pandas/tests/io/data/stata/stata5_117.dta similarity index 100% rename from pandas/tests/io/data/stata5_117.dta rename to pandas/tests/io/data/stata/stata5_117.dta diff --git a/pandas/tests/io/data/stata6.csv b/pandas/tests/io/data/stata/stata6.csv similarity index 100% rename from pandas/tests/io/data/stata6.csv rename to pandas/tests/io/data/stata/stata6.csv diff --git a/pandas/tests/io/data/stata6_113.dta b/pandas/tests/io/data/stata/stata6_113.dta similarity index 100% rename from pandas/tests/io/data/stata6_113.dta rename to pandas/tests/io/data/stata/stata6_113.dta diff --git a/pandas/tests/io/data/stata6_114.dta b/pandas/tests/io/data/stata/stata6_114.dta similarity index 100% rename from pandas/tests/io/data/stata6_114.dta rename to pandas/tests/io/data/stata/stata6_114.dta diff --git a/pandas/tests/io/data/stata6_115.dta b/pandas/tests/io/data/stata/stata6_115.dta similarity index 100% rename from pandas/tests/io/data/stata6_115.dta rename to pandas/tests/io/data/stata/stata6_115.dta diff --git a/pandas/tests/io/data/stata6_117.dta b/pandas/tests/io/data/stata/stata6_117.dta similarity index 100% rename from pandas/tests/io/data/stata6_117.dta rename to pandas/tests/io/data/stata/stata6_117.dta diff --git a/pandas/tests/io/data/stata7_111.dta b/pandas/tests/io/data/stata/stata7_111.dta similarity index 100% rename from pandas/tests/io/data/stata7_111.dta rename to pandas/tests/io/data/stata/stata7_111.dta diff --git a/pandas/tests/io/data/stata7_115.dta b/pandas/tests/io/data/stata/stata7_115.dta similarity index 100% rename from pandas/tests/io/data/stata7_115.dta rename to pandas/tests/io/data/stata/stata7_115.dta diff --git a/pandas/tests/io/data/stata7_117.dta b/pandas/tests/io/data/stata/stata7_117.dta similarity index 100% rename from pandas/tests/io/data/stata7_117.dta rename to pandas/tests/io/data/stata/stata7_117.dta diff --git a/pandas/tests/io/data/stata8_113.dta b/pandas/tests/io/data/stata/stata8_113.dta similarity index 100% rename from pandas/tests/io/data/stata8_113.dta rename to pandas/tests/io/data/stata/stata8_113.dta diff --git a/pandas/tests/io/data/stata8_115.dta b/pandas/tests/io/data/stata/stata8_115.dta similarity index 100% rename from pandas/tests/io/data/stata8_115.dta rename to pandas/tests/io/data/stata/stata8_115.dta diff --git a/pandas/tests/io/data/stata8_117.dta b/pandas/tests/io/data/stata/stata8_117.dta similarity index 100% rename from pandas/tests/io/data/stata8_117.dta rename to pandas/tests/io/data/stata/stata8_117.dta diff --git a/pandas/tests/io/data/stata9_115.dta b/pandas/tests/io/data/stata/stata9_115.dta similarity index 100% rename from pandas/tests/io/data/stata9_115.dta rename to pandas/tests/io/data/stata/stata9_115.dta diff --git a/pandas/tests/io/data/stata9_117.dta b/pandas/tests/io/data/stata/stata9_117.dta similarity index 100% rename from pandas/tests/io/data/stata9_117.dta rename to pandas/tests/io/data/stata/stata9_117.dta diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index 843b3c08421b3..6ec2f477a442d 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -24,11 +24,12 @@ def merge_cells(request): @pytest.fixture -def df_ref(): +def df_ref(datapath): """ Obtain the reference data from read_csv with the Python engine. 
""" - df_ref = read_csv("test1.csv", index_col=0, parse_dates=True, engine="python") + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") return df_ref diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 47e610562a388..6e5610f4f5838 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -13,7 +13,7 @@ def cd_and_set_engine(monkeypatch, datapath): func = functools.partial(pd.read_excel, engine="odf") monkeypatch.setattr(pd, "read_excel", func) - monkeypatch.chdir(datapath("io", "data")) + monkeypatch.chdir(datapath("io", "data", "excel")) def test_read_invalid_types_raises(): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 1d3653f685e1e..e4b7d683b4c3b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -81,34 +81,25 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): pytest.skip() func = partial(pd.read_excel, engine=engine) - monkeypatch.chdir(datapath("io", "data")) + monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "read_excel", func) def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + msg = "Passing an integer for `usecols`" + with pytest.raises(ValueError, match=msg): with ignore_xlrd_time_clock_warning(): - df1 = pd.read_excel( - "test1" + read_ext, "Sheet1", index_col=0, usecols=3 - ) + pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) # usecols as int - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with pytest.raises(ValueError, match=msg): with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel( + pd.read_excel( "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3 ) - # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_list(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["B", "C"]) @@ -499,12 +490,10 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): - if read_ext == ".ods": # TODO: remove once on master - pytest.skip() url = ( - "https://raw.github.com/pandas-dev/pandas/master/" - "pandas/tests/io/data/test1" + read_ext + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/excel/test1" + read_ext ) url_table = pd.read_excel(url) local_table = pd.read_excel("test1" + read_ext) @@ -527,7 +516,7 @@ def test_read_from_s3_url(self, read_ext, s3_resource): def test_read_from_file_url(self, read_ext, datapath): # FILE - localtable = os.path.join(datapath("io", "data"), "test1" + read_ext) + localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext) local_table = pd.read_excel(localtable) try: @@ -828,7 +817,7 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): pytest.skip() func = partial(pd.ExcelFile, engine=engine) - monkeypatch.chdir(datapath("io", "data")) + monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "ExcelFile", func) def test_excel_passes_na(self, read_ext): @@ -895,7 +884,7 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): def 
test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 kwarg = {arg: "Sheet1"} - msg = "unexpected keyword argument `{}`".format(arg) + msg = r"unexpected keyword argument `{}`".format(arg) with pd.ExcelFile("test1" + read_ext) as excel: with pytest.raises(TypeError, match=msg): @@ -919,14 +908,6 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - with pd.ExcelFile("test1" + read_ext) as excel: - df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) - - tm.assert_frame_equal(df3, df4) - with pd.ExcelFile("test1" + read_ext) as excel: df3 = excel.parse(0, index_col=0, skipfooter=1) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 1bc4ad3e7867a..a7730e079a1bb 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from pandas.compat import PY36 import pandas.util._test_decorators as td import pandas as pd @@ -1262,7 +1261,6 @@ def check_called(func): @td.skip_if_no("xlrd") @td.skip_if_no("openpyxl") -@pytest.mark.skipif(not PY36, reason="requires fspath") class TestFSPath: def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index fc36be9e1b738..e04dfc97d4968 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -35,7 +35,7 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): - path = datapath("io", "data", "test1{}".format(read_ext)) + path = datapath("io", "data", "excel", "test1{}".format(read_ext)) with pd.ExcelFile(path) as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, "asdf") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 704de378b0909..0f4a7a33dd115 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,7 @@ import pytest import pytz -from pandas.compat import PY36, is_platform_32bit, is_platform_windows +from pandas.compat import is_platform_32bit, is_platform_windows import pandas as pd from pandas import ( @@ -62,10 +62,7 @@ def filepath_or_buffer(filepath_or_buffer_id, tmp_path): yield buf assert not buf.closed else: - if PY36: - assert isinstance(tmp_path, Path) - else: - assert hasattr(tmp_path, "__fspath__") + assert isinstance(tmp_path, Path) if filepath_or_buffer_id == "pathlike": yield tmp_path / "foo" else: diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index e5af74bdd4d33..5a3afb5025e51 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -390,7 +390,7 @@ def test_applymap_subset_multiindex_code(self): def color_negative_red(val): color = "red" if val < 0 else "black" - return "color: %s" % color + return f"color: {color}" df.loc[pct_subset] df.style.applymap(color_negative_red, subset=pct_subset) @@ -1009,6 +1009,75 @@ def test_bar_bad_align_raises(self): with pytest.raises(ValueError): df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) + def test_format_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + 
ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + + ctx = df.style.format("{:.2%}", na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "110.00%" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate() + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + def test_init_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = Styler(df, na_rep="NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + def test_set_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + ctx = ( + df.style.set_na_rep("NA") + .format(None, na_rep="-", subset=["B"]) + ._translate() + ) + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "-" + + def test_format_non_numeric_na(self): + # GH 21527 28358 + df = pd.DataFrame( + { + "object": [None, np.nan, "foo"], + "datetime": [None, pd.NaT, pd.Timestamp("20120101")], + } + ) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + assert ctx["body"][1][1]["display_value"] == "NA" + assert ctx["body"][1][2]["display_value"] == "NA" + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "-" + + def test_format_with_bad_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + with pytest.raises(TypeError): + df.style.format(None, na_rep=-1) + def test_highlight_null(self, null_color="red"): df = pd.DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 1440b0a6f06f1..4d8edec7c7f14 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -262,7 +262,7 @@ def test_css_to_excel_inherited(css, inherited, expected): @pytest.mark.parametrize( "input_color,output_color", ( - [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + list(CSSToExcelConverter.NAMED_COLORS.items()) + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] ), diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a3ca61cb1eb63..c71c52bce87b8 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Index import pandas.util.testing as tm @@ -382,7 +380,7 @@ def test_missing_field(self, author_missing_data): }, ] expected = DataFrame(ex_data) - tm.assert_frame_equal(result, expected, 
check_like=not PY36) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "max_level,expected", @@ -524,7 +522,7 @@ def test_missing_meta(self, missing_metadata): columns = ["city", "number", "state", "street", "zip", "name"] columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) def test_donot_drop_nonevalues(self): # GH21356 diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 05f97a1769205..c4e03e24a7495 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -173,3 +173,14 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): tm.assert_frame_equal( orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize) ) + + +def test_readjson_unicode(monkeypatch): + with tm.ensure_clean("test.json") as path: + monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") + with open(path, "w", encoding="utf-8") as f: + f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') + + result = read_json(path) + expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 183ad500b15f3..a87e1e796c194 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -7,9 +7,9 @@ class BaseParser: - engine = None # type: Optional[str] + engine: Optional[str] = None low_memory = True - float_precision_choices = [] # type: List[Optional[str]] + float_precision_choices: List[Optional[str]] = [] def update_kwargs(self, kwargs): kwargs = kwargs.copy() diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 6e6c31bc5b972..590f26a76802a 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2160,10 +2160,6 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" -@pytest.mark.skipif( - compat.is_platform_windows() and not compat.PY36, - reason="On Python < 3.6 won't pass on Windows", -) @pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4dfb8d3bd2dc8..66e00f4eb6c1c 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -21,9 +21,7 @@ def test_index_col_named(all_parsers, with_header): KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - header = ( - "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" - ) # noqa + header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" if with_header: data = header + no_header diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index dbe721b10a3ce..57e2950b06ce8 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -166,7 +166,7 @@ def test_s3_fails(self): # Receive a permission error when trying to read a private bucket. 
# It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): - read_csv("s3://cant_get_it/") + read_csv("s3://cant_get_it/file.csv") def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -184,6 +184,8 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): def test_read_csv_chunked_download(self, s3_resource, caplog): # 8 MB, S3FS usees 5MB chunks + import s3fs + df = DataFrame(np.random.randn(100000, 4), columns=list("abcd")) buf = BytesIO() str_buf = StringIO() @@ -194,7 +196,13 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf) - with caplog.at_level(logging.DEBUG, logger="s3fs.core"): + # Possibly some state leaking in between tests. + # If we don't clear this cache, we saw `GetObject operation: Forbidden`. + # Presumably the s3fs instance is being cached, with the directory listing + # from *before* we add the large-file.csv in the pandas-test bucket. + s3fs.S3FileSystem.clear_instance_cache() + + with caplog.at_level(logging.DEBUG, logger="s3fs"): read_csv("s3://pandas-test/large-file.csv", nrows=5) # log of fetch_range (start, stop) assert (0, 5505024) in {x.args[-2:] for x in caplog.records} diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f9d525399bde3..d79280f9ea494 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import PY36, is_platform_little_endian, is_platform_windows +from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_categorical_dtype @@ -2806,16 +2806,16 @@ def test_select_iterator(self, setup_path): expected = store.select("df") - results = [s for s in store.select("df", iterator=True)] + results = list(store.select("df", iterator=True)) result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select("df", chunksize=100)] + results = list(store.select("df", chunksize=100)) assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select("df", chunksize=150)] + results = list(store.select("df", chunksize=150)) result = concat(results) tm.assert_frame_equal(result, expected) @@ -2835,7 +2835,7 @@ def test_select_iterator(self, setup_path): df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df", format="table") - results = [s for s in read_hdf(path, "df", chunksize=100)] + results = list(read_hdf(path, "df", chunksize=100)) result = concat(results) assert len(results) == 5 @@ -2856,12 +2856,9 @@ def test_select_iterator(self, setup_path): # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") - results = [ - s - for s in store.select_as_multiple( - ["df1", "df2"], selector="df1", chunksize=150 - ) - ] + results = list( + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + ) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2916,19 +2913,19 @@ def test_select_iterator_complete_8014(self, setup_path): end_dt = expected.index[-1] # select w/iterator and no where clause works - results = [s for s in store.select("df", chunksize=chunksize)] + results = list(store.select("df", chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select 
w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2936,7 +2933,7 @@ def test_select_iterator_complete_8014(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2958,14 +2955,14 @@ def test_select_iterator_non_complete_8014(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) @@ -2974,7 +2971,7 @@ def test_select_iterator_non_complete_8014(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[ (expected.index >= beg_dt) & (expected.index <= end_dt) @@ -2992,7 +2989,7 @@ def test_select_iterator_non_complete_8014(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index > '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) assert 0 == len(results) def test_select_iterator_many_empty_frames(self, setup_path): @@ -3014,14 +3011,14 @@ def test_select_iterator_many_empty_frames(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) assert len(results) == 1 result = concat(results) @@ -3032,7 +3029,7 @@ def 
test_select_iterator_many_empty_frames(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) # should be 1, is 10 assert len(results) == 1 @@ -3052,7 +3049,7 @@ def test_select_iterator_many_empty_frames(self, setup_path): where = "index <= '{beg_dt}' & index >= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) # should be [] assert len(results) == 0 @@ -4711,7 +4708,6 @@ def test_read_hdf_series_mode_r(self, format, setup_path): result = pd.read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) - @pytest.mark.skipif(not PY36, reason="Need python 3.6") def test_fspath(self): with tm.ensure_clean("foo.h5") as path: with pd.HDFStore(path) as store: diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 7893877be2033..a52b22122ba81 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -104,7 +104,7 @@ def test1_incremental(self): reader = read_sas(self.file01, index="SEQN", chunksize=1000) - all_data = [x for x in reader] + all_data = list(reader) data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 4559ba264d8b7..666dfd245acaa 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -258,6 +258,7 @@ def test_round_trip_valid_encodings(self, enc, df): @pytest.mark.clipboard @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) +@pytest.mark.xfail(reason="flaky in CI", strict=False) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 655fd9d01c1c0..2af370a696860 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -208,21 +208,33 @@ def test_read_expands_user_home_dir( @pytest.mark.parametrize( "reader, module, path", [ - (pd.read_csv, "os", ("io", "data", "iris.csv")), - (pd.read_table, "os", ("io", "data", "iris.csv")), - (pd.read_fwf, "os", ("io", "data", "fixed_width_format.txt")), - (pd.read_excel, "xlrd", ("io", "data", "test1.xlsx")), - (pd.read_feather, "feather", ("io", "data", "feather-0_3_1.feather")), + (pd.read_csv, "os", ("data", "iris.csv")), + (pd.read_table, "os", ("data", "iris.csv")), + ( + pd.read_fwf, + "os", + ("io", "data", "fixed_width", "fixed_width_format.txt"), + ), + (pd.read_excel, "xlrd", ("io", "data", "excel", "test1.xlsx")), + ( + pd.read_feather, + "feather", + ("io", "data", "feather", "feather-0_3_1.feather"), + ), ( pd.read_hdf, "tables", ("io", "data", "legacy_hdf", "datetimetz_object.h5"), ), - (pd.read_stata, "os", ("io", "data", "stata10_115.dta")), + (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")), (pd.read_msgpack, "os", ("io", "msgpack", "data", "frame.mp")), - (pd.read_pickle, "os", ("io", "data", 
"categorical.0.25.0.pickle")), + ( + pd.read_pickle, + "os", + ("io", "data", "pickle", "categorical.0.25.0.pickle"), + ), ], ) def test_read_fspath_all(self, reader, module, path, datapath): @@ -296,7 +308,7 @@ def test_write_fspath_hdf5(self): @pytest.fixture def mmap_file(datapath): - return datapath("io", "data", "test_mmap.csv") + return datapath("io", "data", "csv", "test_mmap.csv") class TestMMapWrapper: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index d68b6a1effaa0..9bcdda2039458 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -140,7 +140,7 @@ def test_with_missing_lzma(): import pandas """ ) - subprocess.check_output([sys.executable, "-c", code]) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) def test_with_missing_lzma_runtime(): @@ -157,4 +157,4 @@ def test_with_missing_lzma_runtime(): df.to_csv('foo.csv', compression='xz') """ ) - subprocess.check_output([sys.executable, "-c", code]) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0f68a6534dad1..e06f2c31a2870 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -107,23 +107,6 @@ def test_unsupported_other(self): # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) - def test_rw_nthreads(self): - df = pd.DataFrame({"A": np.arange(100000)}) - expected_warning = ( - "the 'nthreads' keyword is deprecated, use 'use_threads' instead" - ) - # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - self.check_round_trip(df, nthreads=2) - # we have an extra FutureWarning because of #GH23752 - assert any(expected_warning in str(x) for x in w) - - # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - self.check_round_trip(df, nthreads=1) - # we have an extra FutureWarnings because of #GH23752 - assert any(expected_warning in str(x) for x in w) - def test_rw_use_threads(self): df = pd.DataFrame({"A": np.arange(100000)}) self.check_round_trip(df, use_threads=True) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 3d855a12d5481..d8d617ceeebff 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -63,7 +63,7 @@ def test_bs4_version_fails(monkeypatch, datapath): monkeypatch.setattr(bs4, "__version__", "4.2") with pytest.raises(ImportError, match="Pandas requires version"): - read_html(datapath("io", "data", "spam.html"), flavor="bs4") + read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4") def test_invalid_flavor(): @@ -78,7 +78,7 @@ def test_invalid_flavor(): @td.skip_if_no("bs4") @td.skip_if_no("lxml") def test_same_ordering(datapath): - filename = datapath("io", "data", "valid_markup.html") + filename = datapath("io", "data", "html", "valid_markup.html") dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"]) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -95,10 +95,10 @@ def test_same_ordering(datapath): class TestReadHtml: @pytest.fixture(autouse=True) def set_files(self, datapath): - self.spam_data = datapath("io", "data", "spam.html") + self.spam_data = datapath("io", "data", "html", "spam.html") self.spam_data_kwargs = {} 
self.spam_data_kwargs["encoding"] = "UTF-8" - self.banklist_data = datapath("io", "data", "banklist.html") + self.banklist_data = datapath("io", "data", "html", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -135,7 +135,7 @@ def test_banklist_url(self): def test_spam_url(self): url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/master/" - "pandas/tests/io/data/spam.html" + "pandas/tests/io/data/html/spam.html" ) df1 = self.read_html(url, ".*Water.*") df2 = self.read_html(url, "Unit") @@ -376,7 +376,7 @@ def test_python_docs_table(self): @pytest.mark.slow def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = datapath("io", "data", "macau.html") + macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) df = dfs[all_non_nan_table_index] @@ -385,7 +385,7 @@ def test_thousands_macau_stats(self, datapath): @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - macau_data = datapath("io", "data", "macau.html") + macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -566,7 +566,7 @@ def test_parse_header_of_non_string_column(self): tm.assert_frame_equal(result, expected) def test_nyse_wsj_commas_table(self, datapath): - data = datapath("io", "data", "nyse_wsj.html") + data = datapath("io", "data", "html", "nyse_wsj.html") df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0] expected = Index( @@ -594,7 +594,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( - datapath("io", "data", "banklist.csv"), + datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, ) assert df.shape == ground_truth.shape @@ -889,7 +889,7 @@ def test_parse_dates_combine(self): tm.assert_frame_equal(newdf, res[0]) def test_computer_sales_page(self, datapath): - data = datapath("io", "data", "computer_sales_page.html") + data = datapath("io", "data", "html", "computer_sales_page.html") msg = ( r"Passed header=\[0,1\] are too many " r"rows for this multi_index of columns" @@ -897,11 +897,11 @@ def test_computer_sales_page(self, datapath): with pytest.raises(ParserError, match=msg): self.read_html(data, header=[0, 1]) - data = datapath("io", "data", "computer_sales_page.html") + data = datapath("io", "data", "html", "computer_sales_page.html") assert self.read_html(data, header=[1, 2]) def test_wikipedia_states_table(self, datapath): - data = datapath("io", "data", "wikipedia_states.html") + data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), "{data!r} is not a file".format(data=data) assert os.path.getsize(data), "{data!r} is an empty file".format(data=data) result = self.read_html(data, "Arizona", header=1)[0] @@ -1095,14 +1095,14 @@ def test_multiple_header_rows(self): tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): - filename = datapath("io", "data", "valid_markup.html") + filename = datapath("io", "data", "html", "valid_markup.html") dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow def test_fallback_success(self, datapath): - banklist_data = datapath("io", "data", "banklist.html") + 
banklist_data = datapath("io", "data", "html", "banklist.html") self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): @@ -1240,7 +1240,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = datapath("io", "data", "valid_markup.html") + filename = datapath("io", "data", "html", "valid_markup.html") helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index debc797fe6e88..3e687d185df84 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -405,7 +405,7 @@ def test_write_ignoring_index(self, engine): ["one", "two", "one", "two", "one", "two", "one", "two"], ] df = pd.DataFrame( - {"one": [i for i in range(8)], "two": [-i for i in range(8)]}, index=arrays + {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays ) expected = df.reset_index(drop=True) @@ -514,18 +514,24 @@ def test_additional_extension_arrays(self, pa): "b": pd.Series(["a", None, "c"], dtype="string"), } ) - # currently de-serialized as plain int / object - expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # de-serialized as plain int / object + expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) check_round_trip(df, pa, expected=expected) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - # if missing values in integer, currently de-serialized as float - expected = df.assign(a=df.a.astype("float64")) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # if missing values in integer, currently de-serialized as float + expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) class TestParquetFastParquet(Base): - @td.skip_if_no("fastparquet", min_version="0.2.1") + @td.skip_if_no("fastparquet", min_version="0.3.2") def test_basic(self, fp, df_full): df = df_full diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 23a16c885687f..3be966edef080 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -202,23 +202,25 @@ def test_legacy_sparse_warning(datapath): Generated with >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 0, 1, 1]}).to_sparse() - >>> df.to_pickle("pandas/tests/io/data/sparseframe-0.20.3.pickle.gz", + >>> df.to_pickle("pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz", ... compression="gzip") >>> s = df['B'] - >>> s.to_pickle("pandas/tests/io/data/sparseseries-0.20.3.pickle.gz", + >>> s.to_pickle("pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz", ... 
compression="gzip") """ with tm.assert_produces_warning(FutureWarning): simplefilter("ignore", DeprecationWarning) # from boto pd.read_pickle( - datapath("io", "data", "sparseseries-0.20.3.pickle.gz"), compression="gzip" + datapath("io", "data", "pickle", "sparseseries-0.20.3.pickle.gz"), + compression="gzip", ) with tm.assert_produces_warning(FutureWarning): simplefilter("ignore", DeprecationWarning) # from boto pd.read_pickle( - datapath("io", "data", "sparseframe-0.20.3.pickle.gz"), compression="gzip" + datapath("io", "data", "pickle", "sparseframe-0.20.3.pickle.gz"), + compression="gzip", ) @@ -382,7 +384,7 @@ def test_read(self, protocol, get_random_path): def test_unicode_decode_error(): # pickle file written with py27, should be readable without raising # UnicodeDecodeError, see GH#28645 - path = os.path.join(os.path.dirname(__file__), "data", "test_py27.pkl") + path = os.path.join(os.path.dirname(__file__), "data", "pickle", "test_py27.pkl") df = pd.read_pickle(path) # just test the columns are correct since the values are random diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index ca84156d104fc..ccf3167d49371 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -9,7 +9,7 @@ def test_spss_labelled_num(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "labelled-num.sav") + fname = datapath("io", "data", "spss", "labelled-num.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) @@ -23,7 +23,7 @@ def test_spss_labelled_num(datapath): def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "labelled-num-na.sav") + fname = datapath("io", "data", "spss", "labelled-num-na.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": ["This is one", None]}) @@ -37,7 +37,7 @@ def test_spss_labelled_num_na(datapath): def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "labelled-str.sav") + fname = datapath("io", "data", "spss", "labelled-str.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"gender": ["Male", "Female"]}) @@ -51,7 +51,7 @@ def test_spss_labelled_str(datapath): def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "umlauts.sav") + fname = datapath("io", "data", "spss", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame( @@ -67,7 +67,7 @@ def test_spss_umlauts(datapath): def test_spss_usecols(datapath): # usecols must be list-like - fname = datapath("io", "data", "labelled-num.sav") + fname = datapath("io", "data", "spss", "labelled-num.sav") with pytest.raises(TypeError, match="usecols must be list-like."): pd.read_spss(fname, usecols="VAR00002") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 183a47c6039ec..fe65820a7c975 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -275,7 +275,7 @@ def _get_exec(self): else: return self.conn.cursor() - @pytest.fixture(params=[("io", "data", "iris.csv")]) + @pytest.fixture(params=[("data", "iris.csv")]) def load_iris_data(self, datapath, request): import io @@ -583,7 +583,7 @@ class _TestSQLApi(PandasSQLTest): """ flavor = "sqlite" - mode = None # type: str 
+ mode: str def setup_connect(self): self.conn = self.connect() @@ -1234,7 +1234,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ - flavor = None # type: str + flavor: str @pytest.fixture(autouse=True, scope="class") def setup_class(cls): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a0ec06a2197ae..2cc80a6e5565d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -28,7 +28,7 @@ @pytest.fixture def dirpath(datapath): - return datapath("io", "data") + return datapath("io", "data", "stata") @pytest.fixture @@ -42,7 +42,7 @@ def parsed_114(dirpath): class TestStata: @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.dirpath = datapath("io", "data") + self.dirpath = datapath("io", "data", "stata") self.dta1_114 = os.path.join(self.dirpath, "stata1_114.dta") self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta") @@ -383,8 +383,7 @@ def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) - with tm.assert_produces_warning(FutureWarning): - encoded = read_stata(self.dta_encoding, encoding="latin-1") + encoded = read_stata(self.dta_encoding) result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -392,10 +391,7 @@ def test_encoding(self, version): assert isinstance(result, str) with tm.ensure_clean() as path: - with tm.assert_produces_warning(FutureWarning): - encoded.to_stata( - path, write_index=False, version=version, encoding="latin-1" - ) + encoded.to_stata(path, write_index=False, version=version) reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index c84b78c79e771..9025f8c361a82 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -94,3 +94,11 @@ def test_setting_backend_without_plot_raises(): def test_no_matplotlib_ok(): with pytest.raises(ImportError): pandas.plotting._core._get_plot_backend("matplotlib") + + +def test_extra_kinds_ok(monkeypatch, restore_backend): + # https://github.com/pandas-dev/pandas/pull/28647 + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + pandas.set_option("plotting.backend", "pandas_dummy_backend") + df = pandas.DataFrame({"A": [1, 2, 3]}) + df.plot(kind="not a real kind") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 973bda8292b2a..f5161b481ca50 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -99,33 +99,12 @@ def test_nonnumeric_exclude(self): with pytest.raises(TypeError, match=msg): df["A"].plot() - def test_tsplot_deprecated(self): - from pandas.tseries.plotting import tsplot - - _, ax = self.plt.subplots() - ts = tm.makeTimeSeries() - - with tm.assert_produces_warning(FutureWarning): - tsplot(ts, self.plt.Axes.plot, ax=ax) - @pytest.mark.slow def test_tsplot(self): - from pandas.tseries.plotting import tsplot - _, ax = self.plt.subplots() ts = tm.makeTimeSeries() - def f(*args, **kwds): - with tm.assert_produces_warning(FutureWarning): - return tsplot(s, self.plt.Axes.plot, *args, **kwds) - - for s in self.period_ser: - _check_plot_works(f, s.index.freq, ax=ax, series=s) - - for s in self.datetime_ser: - _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) - for s in self.period_ser: _check_plot_works(s.plot, ax=ax) @@ -194,17 +173,6 @@ def check_format_of_first_point(ax, 
expected_string): check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") tm.close() - # tsplot - from pandas.tseries.plotting import tsplot - - _, ax = self.plt.subplots() - with tm.assert_produces_warning(FutureWarning): - tsplot(annual, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - with tm.assert_produces_warning(FutureWarning): - tsplot(daily, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.slow def test_line_plot_period_series(self): for s in self.period_ser: @@ -892,16 +860,6 @@ def test_to_weekly_resampling(self): for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - _, ax = self.plt.subplots() - from pandas.tseries.plotting import tsplot - - with tm.assert_produces_warning(FutureWarning): - tsplot(high, self.plt.Axes.plot, ax=ax) - with tm.assert_produces_warning(FutureWarning): - lines = tsplot(low, self.plt.Axes.plot, ax=ax) - for l in lines: - assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - @pytest.mark.slow def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") @@ -926,21 +884,6 @@ def test_from_weekly_resampling(self): tm.assert_numpy_array_equal(xdata, expected_h) tm.close() - _, ax = self.plt.subplots() - from pandas.tseries.plotting import tsplot - - with tm.assert_produces_warning(FutureWarning): - tsplot(low, self.plt.Axes.plot, ax=ax) - with tm.assert_produces_warning(FutureWarning): - lines = tsplot(high, self.plt.Axes.plot, ax=ax) - for l in lines: - assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - xdata = l.get_xdata(orig=False) - if len(xdata) == 12: # idxl lines - tm.assert_numpy_array_equal(xdata, expected_l) - else: - tm.assert_numpy_array_equal(xdata, expected_h) - @pytest.mark.slow def test_from_resampling_area_line_mixed(self): idxh = date_range("1/1/1999", periods=52, freq="W") diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 940cfef4058e0..c51cd0e92eb3c 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -266,7 +266,7 @@ def test_parallel_coordinates_with_sorted_labels(self): df = DataFrame( { - "feat": [i for i in range(30)], + "feat": list(range(30)), "class": [2 for _ in range(10)] + [3 for _ in range(10)] + [1 for _ in range(10)], @@ -279,8 +279,7 @@ def test_parallel_coordinates_with_sorted_labels(self): ) ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1]) prev_next_tupels = zip( - [i for i in ordered_color_label_tuples[0:-1]], - [i for i in ordered_color_label_tuples[1:]], + list(ordered_color_label_tuples[0:-1]), list(ordered_color_label_tuples[1:]) ) for prev, nxt in prev_next_tupels: # labels and colors are ordered strictly increasing diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 4dfe561831ced..b0ef0c58ca65a 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -179,8 +179,8 @@ class TestIndexReductions: [ (0, 400, 3), (500, 0, -6), - (-10 ** 6, 10 ** 6, 4), - (10 ** 6, -10 ** 6, -4), + (-(10 ** 6), 10 ** 6, 4), + (10 ** 6, -(10 ** 6), -4), (0, 10, 20), ], ) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index dc72800227c0e..161581e16b6fe 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -112,6 +112,22 @@ def test_resample_empty_series(freq, empty_series, 
resample_method): tm.assert_series_equal(result, expected, check_dtype=False) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("resample_method", ["count", "size"]) +def test_resample_count_empty_series(freq, empty_series, resample_method): + # GH28427 + result = getattr(empty_series.resample(freq), resample_method)() + + if isinstance(empty_series.index, PeriodIndex): + index = empty_series.index.asfreq(freq=freq) + else: + index = empty_series.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + + tm.assert_series_equal(result, expected) + + @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) def test_resample_empty_dataframe(empty_frame, freq, resample_method): @@ -136,6 +152,44 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): # test size for GH13212 (currently stays as df) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_count_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).count() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.DataFrame({"a": []}, dtype="int64", index=index) + + tm.assert_frame_equal(result, expected) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_size_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).size() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) @pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index cbdfbb7a3100b..8e1774d8ee5b7 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -247,10 +247,9 @@ def test_agg_consistency(): r = df.resample("3T") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = r[["A", "B", "C"]].agg({"r1": "mean", "r2": "sum"}) - result = r.agg({"r1": "mean", "r2": "sum"}) - tm.assert_frame_equal(result, expected, check_like=True) + msg = "nested renamer is not supported" + with pytest.raises(pd.core.base.SpecificationError, match=msg): + r.agg({"r1": "mean", "r2": "sum"}) # TODO: once GH 14008 is fixed, move these tests into @@ -307,26 +306,23 @@ def test_agg(): result = t["A"].aggregate(["mean", "sum"]) tm.assert_frame_equal(result, expected) - expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) + msg = "nested renamer is not supported" for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) expected = pd.concat([a_mean, 
a_sum, b_mean, b_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] ) for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.aggregate( + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate( { "A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}, } ) - tm.assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( @@ -383,12 +379,10 @@ def test_agg_misc(): [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] ) + msg = "nested renamer is not supported" for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t[["A", "B"]].agg( - OrderedDict([("result1", np.sum), ("result2", np.mean)]) - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) # agg with different hows expected = pd.concat( @@ -408,21 +402,11 @@ def test_agg_misc(): # series like aggs for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t["A"].agg({"A": ["sum", "std"]}) - expected = pd.concat([t["A"].sum(), t["A"].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std")]) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"]}) - expected = pd.concat( - [t["A"].agg(["sum", "std"]), t["A"].agg(["mean", "std"])], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) # errors # invalid names in the agg specification @@ -451,28 +435,20 @@ def test_agg_nested_dicts(): df.groupby(pd.Grouper(freq="2D")), ] - msg = r"cannot perform renaming for r(1|2) with a nested dictionary" + msg = "nested renamer is not supported" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) for t in cases: - expected = pd.concat( - [t["A"].mean(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t[["A", "B"]].agg( + + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg( {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} ) - tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) def test_try_aggregate_non_existing_column(): 
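The resample test changes above drop the deprecated nested-renamer aggregation path: specs such as {"A": {"mean": "mean", "sum": "sum"}} now raise SpecificationError instead of emitting a FutureWarning. A minimal illustrative sketch of the behaviour these tests assert (the frame, column names, and sizes below are made up for illustration; only the "3T" rule and the error message come from the tests themselves):

import numpy as np
import pandas as pd
import pytest

# Illustrative minutely frame, resampled with the same "3T" rule used in the tests.
index = pd.date_range("2000-01-01", periods=9, freq="T")
df = pd.DataFrame({"A": np.arange(9.0), "B": np.arange(9.0)}, index=index)
r = df.resample("3T")

# Nested renamers are rejected outright now.
with pytest.raises(
    pd.core.base.SpecificationError, match="nested renamer is not supported"
):
    r.agg({"A": {"mean": "mean", "sum": "sum"}})

# A plain list of functions, as the retained tests use, is still supported.
result = r["A"].agg(["mean", "sum"])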
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 925eaac45045d..e477b7608ab93 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -624,7 +624,7 @@ def test_join_mixed_non_unique_index(self): def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") - df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.join(df2, how="inner", rsuffix="_df2") expected = DataFrame( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dd51a1a6c8359..5f4e8323c7127 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -860,7 +860,7 @@ def test_merge_datetime64tz_with_dst_transition(self): def test_merge_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") - df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.merge(df2, left_index=True, right_index=True, how="inner") expected = DataFrame( diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 1d8d2add3840c..bce62571d55ec 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -195,6 +195,27 @@ def test_merge_right_vs_left(self, left, right, sort): tm.assert_frame_equal(merged_left_right, merge_right_left) + def test_merge_multiple_cols_with_mixed_cols_index(self): + # GH29522 + s = pd.Series( + range(6), + pd.MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]), + name="Amount", + ) + df = pd.DataFrame( + {"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0} + ) + result = pd.merge(df, s.reset_index(), on=["lev1", "lev2"]) + expected = pd.DataFrame( + { + "lev1": list("AAABBB"), + "lev2": [1, 2, 3, 1, 2, 3], + "col": [0] * 6, + "Amount": range(6), + } + ) + tm.assert_frame_equal(result, expected) + def test_compress_group_combinations(self): # ~ 40000000 possible unique groups diff --git a/pandas/tests/reshape/merge/test_pivot_old.py b/pandas/tests/reshape/merge/test_pivot_old.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 5c930e01c735d..323b3126c2461 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -27,7 +27,6 @@ isna, read_csv, ) -import pandas.core.common as com from pandas.tests.extension.decimal import to_decimal import pandas.util.testing as tm @@ -1264,7 +1263,7 @@ def test_concat_dict(self): "qux": DataFrame(np.random.randn(4, 3)), } - sorted_keys = com.dict_keys_to_ordered_list(frames) + sorted_keys = list(frames.keys()) result = concat(frames) expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) @@ -1879,7 +1878,7 @@ def test_concat_iterables(self): tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1: - def __len__(self): + def __len__(self) -> int: return 2 def __getitem__(self, index): @@ -2747,6 +2746,22 @@ def test_concat_categorical_tz(): tm.assert_series_equal(result, expected) +def test_concat_categorical_unchanged(): + # GH-12007 + # test fix 
for when concat on categorical and float + # coerces dtype categorical -> float + df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A")) + ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B") + result = pd.concat([df, ser], axis=1) + expected = pd.DataFrame( + { + "A": pd.Series(["a", "b", "c", np.nan], dtype="category"), + "B": pd.Series([0, 1, np.nan, 2], dtype="float"), + } + ) + tm.assert_equal(result, expected) + + def test_concat_datetimeindex_freq(): # GH 3232 # Monotonic index result diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 4521f1bbf1a08..d6946ea41ed84 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -317,6 +317,22 @@ def test_melt_missing_columns_raises(self): ): multi.melt(["A"], ["F"], col_level=0) + def test_melt_mixed_int_str_id_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) + result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"]) + expected = DataFrame( + {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} + ) + tm.assert_frame_equal(result, expected) + + def test_melt_mixed_int_str_value_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"]}) + result = melt(df, value_vars=[0, "a"]) + expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]}) + tm.assert_frame_equal(result, expected) + class TestLreshape: def test_pairs(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a8386d21ba27f..bd1d3d2d5bb63 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -581,23 +581,23 @@ def test_pivot_tz_in_values(self): df = pd.DataFrame( [ { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), }, ] @@ -1656,6 +1656,24 @@ def test_categorical_margins_category(self, observed): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) + def test_margins_casted_to_float(self, observed): + # GH 24893 + df = pd.DataFrame( + { + "A": [2, 4, 6, 8], + "B": [1, 4, 5, 8], + "C": [1, 3, 4, 6], + "D": ["X", "X", "Y", "Y"], + } + ) + + result = pd.pivot_table(df, index="D", margins=True) + expected = pd.DataFrame( + {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, + index=pd.Index(["X", "Y", "All"], name="D"), + ) + tm.assert_frame_equal(result, expected) + def test_pivot_with_categorical(self, observed, ordered_fixture): # gh-21370 idx = [np.nan, "low", "high", "low", np.nan] diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 3bdf91cbf838b..73371c48f9370 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1044,6 +1044,7 @@ def test_add_sub_nat(self): assert NaT - p is NaT p = Period("NaT", freq="M") + assert p is NaT assert p + NaT is NaT assert NaT + p is NaT assert p - NaT is NaT @@ -1284,6 +1285,7 @@ def test_add_offset_nat(self): # freq is DateOffset for freq in ["A", "2A", "3A"]: p = Period("NaT", freq=freq) + 
assert p is NaT for o in [offsets.YearEnd(2)]: assert p + o is NaT assert o + p is NaT @@ -1300,6 +1302,7 @@ def test_add_offset_nat(self): for freq in ["M", "2M", "3M"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p + o is NaT assert o + p is NaT @@ -1317,6 +1320,7 @@ def test_add_offset_nat(self): # freq is Tick for freq in ["D", "2D", "3D"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(5), offsets.Hour(24), @@ -1340,6 +1344,7 @@ def test_add_offset_nat(self): for freq in ["H", "2H", "3H"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(2), offsets.Hour(3), @@ -1439,6 +1444,7 @@ def test_sub_offset_nat(self): # freq is DateOffset for freq in ["A", "2A", "3A"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.YearEnd(2)]: assert p - o is NaT @@ -1453,6 +1459,7 @@ def test_sub_offset_nat(self): for freq in ["M", "2M", "3M"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p - o is NaT @@ -1468,6 +1475,7 @@ def test_sub_offset_nat(self): # freq is Tick for freq in ["D", "2D", "3D"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(5), offsets.Hour(24), @@ -1489,6 +1497,7 @@ def test_sub_offset_nat(self): for freq in ["H", "2H", "3H"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(2), offsets.Hour(3), @@ -1511,6 +1520,7 @@ def test_sub_offset_nat(self): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_nat_ops(self, freq): p = Period("NaT", freq=freq) + assert p is NaT assert p + 1 is NaT assert 1 + p is NaT assert p - 1 is NaT diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 652dd34ca7ce2..f9fa80644d4b9 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -202,8 +202,6 @@ def test_constructor(self): base_expected = 1404205200000000000 # confirm base representation is correct - import calendar - assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected tests = [ @@ -275,8 +273,6 @@ def test_constructor_with_stringoffset(self): base_expected = 1404205200000000000 # confirm base representation is correct - import calendar - assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected tests = [ diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 424b0c9abdef8..250f48b7e711b 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -306,15 +306,14 @@ def test_astimezone(self, tzstr): @td.skip_if_windows def test_tz_convert_utc_with_system_utc(self): - from pandas._libs.tslibs.timezones import maybe_get_tz # from system utc to real utc - ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. assert ts == ts.tz_convert(dateutil.tz.tzutc()) # from system utc to real utc - ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. 
assert ts == ts.tz_convert(dateutil.tz.tzutc()) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index dffb957b8f3b0..db63e0bf9cd30 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -7,7 +7,6 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG -from pandas.compat import PY36 import pandas.util._test_decorators as td from pandas import NaT, Timestamp @@ -375,7 +374,6 @@ def test_replace_dst_border(self): expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") assert result == expected - @pytest.mark.skipif(not PY36, reason="Fold not available until PY3.6") @pytest.mark.parametrize("fold", [0, 1]) @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) def test_replace_dst_fold(self, fold, tz): diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index bcddcf843df06..60b89c01cc22d 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -86,8 +86,7 @@ def test_get(): 1764.0, 1849.0, 1936.0, - ], - dtype="object", + ] ), ) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 5d74ad95be90d..7a24a45b4b6c2 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -233,7 +233,7 @@ def test_reorder_levels(self): def test_rename_axis_mapper(self): # GH 19978 mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) - s = Series([i for i in range(len(mi))], index=mi) + s = Series(list(range(len(mi))), index=mi) result = s.rename_axis(index={"ll": "foo"}) assert result.index.names == ["foo", "nn"] diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 79eaeaf051d2e..e25c4456147f7 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -655,11 +655,6 @@ def test_matmul(self): def test_clip(self, datetime_series): val = datetime_series.median() - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.clip_lower(val).min() == val - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.clip_upper(val).max() == val - assert datetime_series.clip(lower=val).min() == val assert datetime_series.clip(upper=val).max() == val @@ -678,10 +673,8 @@ def test_clip_types_and_nulls(self): for s in sers: thresh = s[2] - with tm.assert_produces_warning(FutureWarning): - lower = s.clip_lower(thresh) - with tm.assert_produces_warning(FutureWarning): - upper = s.clip_upper(thresh) + lower = s.clip(lower=thresh) + upper = s.clip(upper=thresh) assert lower[notna(lower)].min() == thresh assert upper[notna(upper)].max() == thresh assert list(isna(s)) == list(isna(lower)) @@ -703,12 +696,6 @@ def test_clip_against_series(self): # GH #6966 s = Series([1.0, 1.0, 4.0]) - threshold = Series([1.0, 2.0, 3.0]) - - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.clip_lower(threshold), Series([1.0, 2.0, 4.0])) - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.clip_upper(threshold), Series([1.0, 1.0, 3.0])) lower = Series([1.0, 2.0, 3.0]) upper = Series([1.5, 2.5, 3.5]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 656bf5a0e8a44..1e4757ffecb5d 100644 --- a/pandas/tests/series/test_api.py +++ 
b/pandas/tests/series/test_api.py @@ -199,11 +199,6 @@ def test_constructor_dict_timedelta_index(self): ) self._assert_series_equal(result, expected) - def test_from_array_deprecated(self): - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - self.series_klass.from_array([1, 2, 3]) - def test_sparse_accessor_updates_on_inplace(self): s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]") s.drop([0, 1], inplace=True) @@ -261,11 +256,11 @@ def test_tab_completion_with_categorical(self): def get_dir(s): results = [r for r in s.cat.__dir__() if not r.startswith("_")] - return list(sorted(set(results))) + return sorted(set(results)) s = Series(list("aabbcde")).astype("category") results = get_dir(s) - tm.assert_almost_equal(results, list(sorted(set(ok_for_cat)))) + tm.assert_almost_equal(results, sorted(set(ok_for_cat))) @pytest.mark.parametrize( "index", diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 09c5247ef616a..bdbfa333ef33a 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, isna from pandas.conftest import _get_cython_table_params +from pandas.core.base import SpecificationError import pandas.util.testing as tm @@ -157,7 +158,8 @@ def test_apply_dict_depr(self): columns=["A", "B", "C"], index=pd.date_range("1/1/2000", periods=10), ) - with tm.assert_produces_warning(FutureWarning): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): tsdf.A.agg({"foo": ["sum", "mean"]}) @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) @@ -256,31 +258,17 @@ def test_demo(self): tm.assert_series_equal(result, expected) # nested renaming - with tm.assert_produces_warning(FutureWarning): - result = s.agg({"foo": ["min", "max"]}) - - expected = ( - DataFrame({"foo": [0, 5]}, index=["min", "max"]).unstack().rename("series") - ) - tm.assert_series_equal(result, expected) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"]}) def test_multiple_aggregators_with_dict_api(self): s = Series(range(6), dtype="int64", name="series") # nested renaming - with tm.assert_produces_warning(FutureWarning): - result = s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) - - expected = ( - DataFrame( - {"foo": [5.0, np.nan, 0.0, np.nan], "bar": [np.nan, 2.5, np.nan, 15.0]}, - columns=["foo", "bar"], - index=["max", "mean", "min", "sum"], - ) - .unstack() - .rename("series") - ) - tm.assert_series_equal(result.reindex_like(expected), expected) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) def test_agg_apply_evaluate_lambdas_the_same(self, string_series): # test that we are evaluating row-by-row first diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4cbade2669cc6..34b11a0d008aa 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -7,7 +7,6 @@ from pandas._libs import lib from pandas._libs.tslib import iNaT -from pandas.compat import PY36 from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel @@ -1048,10 +1047,7 @@ def test_constructor_dict_order(self): # order by value d = {"b": 1, "a": 0, "c": 2} 
result = Series(d) - if PY36: - expected = Series([1, 0, 2], index=list("bac")) - else: - expected = Series([0, 1, 2], index=list("abc")) + expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 9304e1c4fc157..d038df1747f73 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -208,20 +208,18 @@ def compare(s, name): # test limited display api def get_dir(s): results = [r for r in s.dt.__dir__() if not r.startswith("_")] - return list(sorted(set(results))) + return sorted(set(results)) s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") results = get_dir(s) - tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) - ) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) s = Series( period_range("20130101", periods=5, freq="D", name="xxx").astype(object) ) results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_period + ok_for_period_methods))) + results, sorted(set(ok_for_period + ok_for_period_methods)) ) # 11295 @@ -229,9 +227,7 @@ def get_dir(s): s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(s) - tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) - ) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) exp_values = pd.date_range( "2015-01-01", "2016-01-01", freq="T", tz="UTC" ).tz_convert("America/Chicago") @@ -504,7 +500,7 @@ def test_strftime(self): s.iloc[0] = pd.NaT result = s.dt.strftime("%Y/%m/%d") expected = Series( - ["NaT", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) @@ -554,6 +550,20 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data", + [ + DatetimeIndex(["2019-01-01", pd.NaT]), + PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]"), + ], + ) + def test_strftime_nat(self, data): + # GH 29578 + s = Series(data) + result = s.dt.strftime("%Y-%m-%d") + expected = Series(["2019-01-01", np.nan]) + tm.assert_series_equal(result, expected) + def test_valid_dt_with_missing_values(self): from datetime import date, time diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 4b03115c11cb3..ec0318b2af13a 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -44,12 +44,6 @@ def test_astype(self, dtype): assert as_typed.dtype == dtype assert as_typed.name == s.name - def test_asobject_deprecated(self): - s = Series(np.random.randn(5), name="foo") - with tm.assert_produces_warning(FutureWarning): - o = s.asobject - assert isinstance(o, np.ndarray) - def test_dtype(self, datetime_series): assert datetime_series.dtype == np.dtype("float64") @@ -62,11 +56,6 @@ def test_dtype(self, datetime_series): # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): assert datetime_series.ftypes == "float64:dense" - # GH18243 - Assert .get_ftype_counts is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal( - datetime_series.get_ftype_counts(), Series(1, ["float64:dense"]) - ) 
@pytest.mark.parametrize("value", [np.nan, np.inf]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 4a914e4fb0f2c..0f7e3e307ed19 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -85,7 +85,7 @@ def __ne__(self, other): with capsys.disabled(): li = [Foo(i) for i in range(5)] - s = Series(li, index=[i for i in range(5)]) + s = Series(li, index=list(range(5))) s.is_unique captured = capsys.readouterr() assert len(captured.err) == 0 diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0751e1fb8b906..81bf1edbe86df 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -20,7 +20,6 @@ date_range, isna, ) -from pandas.core.series import remove_na import pandas.util.testing as tm @@ -48,11 +47,6 @@ def _simple_ts(start, end, freq="D"): class TestSeriesMissingData: - def test_remove_na_deprecation(self): - # see gh-16971 - with tm.assert_produces_warning(FutureWarning): - remove_na(Series([])) - def test_timedelta_fillna(self): # GH 3371 s = Series( diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7d212ee7cd667..983560d68c28c 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -43,6 +43,42 @@ def test_logical_operators_bool_dtype_with_empty(self): expected = s_tft tm.assert_series_equal(res, expected) + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(Series(left), Series(right)) + expected = Series(expected) + + tm.assert_series_equal(result, expected) + def test_logical_operators_int_dtype_with_int_dtype(self): # GH#9016: support bitwise op for integer types diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index ebfd468e034f9..8018ecf03960c 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -293,6 +293,29 @@ def test_replace_categorical(self, categorical, numeric): expected = pd.Series(numeric) tm.assert_series_equal(expected, result, check_dtype=False) + def test_replace_categorical_single(self): + # GH 26988 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + s = pd.Series(dti) + c = s.astype("category") + + expected = c.copy() + expected = expected.cat.add_categories("foo") + expected[2] = "foo" + expected = expected.cat.remove_unused_categories() + assert c[2] != "foo" + + result = c.replace(c[2], "foo") + tm.assert_series_equal(expected, result) + assert c[2] != "foo" # ensure non-inplace call does not alter original + + c.replace(c[2], "foo", inplace=True) + tm.assert_series_equal(expected, c) + + first_value = c[0] + c.replace(c[1], c[0], inplace=True) + assert c[0] == c[1] == first_value # test replacing with existing value + def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError diff --git 
a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 7154975c6c73b..1587ae5eb7d07 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -370,6 +370,16 @@ def test_pct_change(self, datetime_series): rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) ) + def test_pct_change_with_duplicate_axis(self): + # GH 28664 + common_idx = date_range("2019-11-14", periods=5, freq="D") + result = Series(range(5), common_idx).pct_change(freq="B") + + # the reason that the expected should be like this is documented at PR 28681 + expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) + + tm.assert_series_equal(result, expected) + def test_pct_change_shift_over_nas(self): s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) @@ -594,7 +604,7 @@ def test_asfreq_keep_index_name(self): # GH #9854 index_name = "bar" index = pd.date_range("20130101", periods=20, name=index_name) - df = pd.DataFrame([x for x in range(20)], columns=["foo"], index=index) + df = pd.DataFrame(list(range(20)), columns=["foo"], index=index) assert index_name == df.index.name assert index_name == df.asfreq("10D").index.name @@ -1030,10 +1040,6 @@ def test_from_M8_structured(self): assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = Series.from_array(arr["Date"], Index([0])) - assert s[0] == dates[0][0] - def test_get_level_values_box(self): from pandas import MultiIndex diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ef844dd97120a..9e89a1b6f0467 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -10,6 +10,13 @@ from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_complex_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype as CDT import pandas as pd @@ -23,6 +30,7 @@ Timestamp, compat, ) +from pandas.conftest import BYTES_DTYPES, STRING_DTYPES import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com @@ -215,10 +223,10 @@ def test_uint64_factorize(self, writable): tm.assert_numpy_array_equal(uniques, expected_uniques) def test_int64_factorize(self, writable): - data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) + data = np.array([2 ** 63 - 1, -(2 ** 63), 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) expected_codes = np.array([0, 1, 0], dtype=np.intp) - expected_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) + expected_uniques = np.array([2 ** 63 - 1, -(2 ** 63)], dtype=np.int64) codes, uniques = algos.factorize(data) tm.assert_numpy_array_equal(codes, expected_codes) @@ -248,7 +256,7 @@ def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. 
data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) - with tm.assert_produces_warning(expected_warning=FutureWarning): + with pytest.raises(TypeError, match="got an unexpected keyword"): algos.factorize(data, order=True) with tm.assert_produces_warning(False): algos.factorize(data) @@ -257,7 +265,7 @@ def test_deprecate_order(self): "data", [ np.array([0, 1, 0], dtype="u8"), - np.array([-2 ** 63, 1, -2 ** 63], dtype="i8"), + np.array([-(2 ** 63), 1, -(2 ** 63)], dtype="i8"), np.array(["__nan__", "foo", "__nan__"], dtype="object"), ], ) @@ -274,8 +282,8 @@ def test_parametrized_factorize_na_value_default(self, data): [ (np.array([0, 1, 0, 2], dtype="u8"), 0), (np.array([1, 0, 1, 2], dtype="u8"), 1), - (np.array([-2 ** 63, 1, -2 ** 63, 0], dtype="i8"), -2 ** 63), - (np.array([1, -2 ** 63, 1, 0], dtype="i8"), 1), + (np.array([-(2 ** 63), 1, -(2 ** 63), 0], dtype="i8"), -(2 ** 63)), + (np.array([1, -(2 ** 63), 1, 0], dtype="i8"), 1), (np.array(["a", "", "a", "b"], dtype=object), "a"), (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), @@ -352,6 +360,35 @@ def test_on_index_object(self): tm.assert_almost_equal(result, expected) + def test_dtype_preservation(self, any_numpy_dtype): + # GH 15442 + if any_numpy_dtype in (BYTES_DTYPES + STRING_DTYPES): + pytest.skip("skip string dtype") + elif is_integer_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1, 2] + elif is_float_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1.0, 2.0] + elif is_complex_dtype(any_numpy_dtype): + data = [complex(1, 0), complex(2, 0), complex(2, 0)] + uniques = [complex(1, 0), complex(2, 0)] + elif is_bool_dtype(any_numpy_dtype): + data = [True, True, False] + uniques = [True, False] + elif is_object_dtype(any_numpy_dtype): + data = ["A", "B", "B"] + uniques = ["A", "B"] + else: + # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere + data = [1, 2, 2] + uniques = [1, 2] + + result = Series(data, dtype=any_numpy_dtype).unique() + expected = np.array(uniques, dtype=any_numpy_dtype) + + tm.assert_numpy_array_equal(result, expected) + def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat( diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 21fed62e51fdf..58093ba4d90a5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -179,7 +179,7 @@ def setup_method(self, method): self.int_series = Series(arr, index=self.int_index, name="a") self.float_series = Series(arr, index=self.float_index, name="a") self.dt_series = Series(arr, index=self.dt_index, name="a") - self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) + self.dt_tz_series = self.dt_tz_index.to_series() self.period_series = Series(arr, index=self.period_index, name="a") self.string_series = Series(arr, index=self.string_index, name="a") self.unicode_series = Series(arr, index=self.unicode_index, name="a") @@ -653,7 +653,7 @@ def test_value_counts_datetime64(self, klass): # with NaT s = df["dt"].copy() - s = klass([v for v in s.values] + [pd.NaT]) + s = klass(list(s.values) + [pd.NaT]) result = s.value_counts() assert result.index.dtype == "datetime64[ns]" diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index fb0511f8902f7..dc88ebe1f7f8e 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,8 +8,6 @@ import numpy as np # noqa import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Series 
import pandas.util.testing as tm @@ -18,19 +16,10 @@ def import_module(name): # we *only* want to skip if the module is truly not available # and NOT just an actual import error because of pandas changes - if PY36: - try: - return importlib.import_module(name) - except ModuleNotFoundError: # noqa - pytest.skip("skipping as {} not available".format(name)) - - else: - try: - return importlib.import_module(name) - except ImportError as e: - if "No module named" in str(e) and name in str(e): - pytest.skip("skipping as {} not available".format(name)) - raise + try: + return importlib.import_module(name) + except ModuleNotFoundError: # noqa + pytest.skip("skipping as {} not available".format(name)) @pytest.fixture diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 3b194044131a8..44829423be1bb 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -257,7 +257,7 @@ def test_repr_name_coincide(self): assert lines[2].startswith("a 0 foo") def test_delevel_infer_dtype(self): - tuples = [tuple for tuple in product(["foo", "bar"], [10, 20], [1.0, 1.1])] + tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) deleveled = df.reset_index() @@ -363,19 +363,19 @@ def test_unstack(self): [ ( [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]], - [u"ix1", u"ix2", u"col1", u"col2", u"col3", u"col4"], + ["ix1", "ix2", "col1", "col2", "col3", "col4"], 2, [None, None, 30.0, None], ), ( [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]], - [u"ix1", u"ix2", u"col1", u"col2", u"col3"], + ["ix1", "ix2", "col1", "col2", "col3"], 2, [None, None, 30.0], ), ( [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]], - [u"ix1", u"ix2", u"col1", u"col2", u"col3"], + ["ix1", "ix2", "col1", "col2", "col3"], None, [None, None, 30.0], ), @@ -389,7 +389,7 @@ def test_unstack_partial( # make sure DataFrame.unstack() works when its run on a subset of the DataFrame # and the Index levels contain values that are not present in the subset result = pd.DataFrame(result_rows, columns=result_columns).set_index( - [u"ix1", "ix2"] + ["ix1", "ix2"] ) result = result.iloc[1:2].unstack("ix2") expected = pd.DataFrame( @@ -583,6 +583,17 @@ def test_stack_unstack_wrong_level_name(self, method): with pytest.raises(KeyError, match="does not match index name"): getattr(s, method)("mistake") + def test_unused_level_raises(self): + # GH 20410 + mi = MultiIndex( + levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], + codes=[[1, 0], [1, 0]], + ) + df = DataFrame(-1, index=range(3), columns=mi) + + with pytest.raises(KeyError, match="notevenone"): + df["notevenone"] + def test_unstack_level_name(self): result = self.frame.unstack("second") expected = self.frame.unstack(level=1) @@ -1989,6 +2000,15 @@ def test_repeat(self): m_df = Series(data, index=m_idx) assert m_df.repeat(3).shape == (3 * len(data),) + def test_subsets_multiindex_dtype(self): + # GH 20757 + data = [["x", 1]] + columns = [("a", "b", np.nan), ("a", "c", 0.0)] + df = DataFrame(data, columns=pd.MultiIndex.from_tuples(columns)) + expected = df.dtypes.a.b + result = df.a.b.dtypes + tm.assert_series_equal(result, expected) + class TestSorted(Base): """ everything you wanted to test about sorting """ diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 52ad56967220f..e5d963a307502 100644 --- a/pandas/tests/test_nanops.py +++ 
b/pandas/tests/test_nanops.py @@ -151,7 +151,7 @@ def check_fun_data( targarval, check_dtype=True, empty_targfunc=None, - **kwargs + **kwargs, ): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: @@ -186,7 +186,7 @@ def check_fun_data( targarval2, check_dtype=check_dtype, empty_targfunc=empty_targfunc, - **kwargs + **kwargs, ) def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): @@ -203,7 +203,7 @@ def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): testarval, targarval, empty_targfunc=empty_targfunc, - **kwargs + **kwargs, ) def check_funs( @@ -215,7 +215,7 @@ def check_funs( allow_date=True, allow_tdelta=True, allow_obj=True, - **kwargs + **kwargs, ): self.check_fun(testfunc, targfunc, "arr_float", **kwargs) self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs) @@ -302,7 +302,7 @@ def test_nanmean_overflow(self): # In the previous implementation mean can overflow for int dtypes, it # is now consistent with numpy - for a in [2 ** 55, -2 ** 55, 20150515061816532]: + for a in [2 ** 55, -(2 ** 55), 20150515061816532]: s = Series(a, index=range(500), dtype=np.int64) result = s.mean() np_result = s.values.mean() @@ -476,7 +476,7 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_2d, self.arr_float1_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) @@ -486,7 +486,7 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_2d, self.arr_float1_nan_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -500,13 +500,13 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_2d, self.arr_nan_float1_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) res25 = checkfun( self.arr_float_2d, self.arr_float1_2d, min_periods=len(self.arr_float_2d) + 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) @@ -521,7 +521,7 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_1d, self.arr_float1_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) @@ -531,7 +531,7 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_1d, self.arr_float1_nan_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -545,13 +545,13 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_1d, self.arr_nan_float1_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) res25 = checkfun( self.arr_float_1d, self.arr_float1_1d, min_periods=len(self.arr_float_1d) + 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 5d7eb70817a11..90cd9cc3e006d 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -314,27 +314,27 @@ def verify_order(df): def test_decons(): - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) + def testit(codes_list, shape): + 
group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) - for a, b in zip(label_list, label_list2): + for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [ + codes_list = [ np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), ] - testit(label_list, shape) + testit(codes_list, shape) shape = (10000, 10000) - label_list = [ + codes_list = [ np.tile(np.arange(10000, dtype=np.int64), 5), np.tile(np.arange(10000, dtype=np.int64), 5), ] - testit(label_list, shape) + testit(codes_list, shape) class TestSafeSort: @@ -355,42 +355,42 @@ def test_basic_sort(self): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - def test_labels(self, verify): + def test_codes(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) - labels = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) # na_sentinel - labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) - labels = [] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([], dtype=np.intp) + codes = [] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_labels_out_of_bound(self, na_sentinel): + def test_codes_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # out of bound indices - labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel) - expected_labels = np.array( + codes = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel) + expected_codes = np.array( [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp ) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) @@ -399,12 +399,12 @@ def test_mixed_integer(self): tm.assert_numpy_array_equal(result, expected) values = np.array(["b", 1, 0, "a"], dtype=object) - labels = [0, 1, 2, 3, 
0, -1, 1] - result, result_labels = safe_sort(values, labels) + codes = [0, 1, 2, 3, 0, -1, 1] + result, result_codes = safe_sort(values, codes) expected = np.array([0, 1, "a", "b"], dtype=object) - expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer_from_list(self): values = ["b", 1, 0, "a", 0, "b"] @@ -428,10 +428,10 @@ def test_exceptions(self): safe_sort(values=1) with pytest.raises(TypeError, match="Only list-like objects or None"): - safe_sort(values=[0, 1, 2], labels=1) + safe_sort(values=[0, 1, 2], codes=1) with pytest.raises(ValueError, match="values should be unique"): - safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') @@ -443,12 +443,12 @@ def test_extension_array(self): @pytest.mark.parametrize("verify", [True, False]) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_extension_array_labels(self, verify, na_sentinel): + def test_extension_array_codes(self, verify, na_sentinel): a = array([1, 3, 2], dtype="Int64") - result, labels = safe_sort( + result, codes = safe_sort( a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify ) expected_values = array([1, 2, 3], dtype="Int64") - expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..1261c3bbc86db 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -731,7 +731,10 @@ def test_count(self): tm.assert_series_equal(result, exp) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_count(mixed, "a") xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_numpy_array_equal(rs, xp) @@ -755,14 +758,14 @@ def test_contains(self): expected = np.array([False, np.nan, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) - values = ["foo", "xyz", "fooommm__foo", "mmm_"] + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex - values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"] + values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) @@ -773,7 +776,10 @@ def test_contains(self): tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_contains(mixed, "o") xp = np.array( [False, np.nan, False, np.nan, 
np.nan, True, np.nan, np.nan, np.nan], @@ -869,7 +875,10 @@ def test_endswith(self): tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_endswith(mixed, "f") xp = np.array( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], @@ -1853,15 +1862,16 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) - tm.assert_series_equal(empty_str, empty.str.isalnum()) - tm.assert_series_equal(empty_str, empty.str.isalpha()) - tm.assert_series_equal(empty_str, empty.str.isdigit()) - tm.assert_series_equal(empty_str, empty.str.isspace()) - tm.assert_series_equal(empty_str, empty.str.islower()) - tm.assert_series_equal(empty_str, empty.str.isupper()) - tm.assert_series_equal(empty_str, empty.str.istitle()) - tm.assert_series_equal(empty_str, empty.str.isnumeric()) - tm.assert_series_equal(empty_str, empty.str.isdecimal()) + # ismethods should always return boolean (GH 29624) + tm.assert_series_equal(empty_bool, empty.str.isalnum()) + tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isdigit()) + tm.assert_series_equal(empty_bool, empty.str.isspace()) + tm.assert_series_equal(empty_bool, empty.str.islower()) + tm.assert_series_equal(empty_bool, empty.str.isupper()) + tm.assert_series_equal(empty_bool, empty.str.istitle()) + tm.assert_series_equal(empty_bool, empty.str.isnumeric()) + tm.assert_series_equal(empty_bool, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) @@ -3488,10 +3498,13 @@ def test_casefold(self): def test_string_array(any_string_method): + method_name, args, kwargs = any_string_method + if method_name == "decode": + pytest.skip("decode requires bytes.") + data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype="string") - method_name, args, kwargs = any_string_method expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) @@ -3502,8 +3515,29 @@ def test_string_array(any_string_method): ): assert result.dtype == "string" result = result.astype(object) + + elif expected.dtype == "float" and expected.isna().any(): + assert result.dtype == "Int64" + result = result.astype("float") + elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("count", [2, None]), + ("find", [0, None]), + ("index", [0, None]), + ("rindex", [2, None]), + ], +) +def test_string_array_numeric_integer_array(method, expected): + s = Series(["aba", None], dtype="string") + result = getattr(s.str, method)("a") + expected = Series(expected, dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index bed8d2461f65d..d70780741aa88 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ 
b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,5 +1,5 @@ from datetime import date, datetime, time as dt_time, timedelta -from typing import Dict, List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import numpy as np import pytest @@ -95,7 +95,7 @@ def test_to_M8(): class Base: - _offset = None # type: Type[DateOffset] + _offset: Optional[Type[DateOffset]] = None d = Timestamp(datetime(2008, 1, 2)) timezones = [ @@ -743,7 +743,7 @@ def test_onOffset(self): for offset, d, expected in tests: assert_onOffset(offset, d, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( BDay(), @@ -2631,7 +2631,7 @@ def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( CDay(), @@ -2878,7 +2878,7 @@ def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( CBMonthEnd(), @@ -3027,7 +3027,7 @@ def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( CBMonthBegin(), @@ -4253,7 +4253,7 @@ def test_valid_default_arguments(offset_types): cls() -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_month_attributes(kwd, month_classes): # GH#18226 cls = month_classes @@ -4262,14 +4262,14 @@ def test_valid_month_attributes(kwd, month_classes): cls(**{kwd: 3}) -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_relativedelta_kwargs(kwd): # Check that all the arguments specified in liboffsets.relativedelta_kwds # are in fact valid relativedelta keyword args DateOffset(**{kwd: 1}) -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_tick_attributes(kwd, tick_classes): # GH#18226 cls = tick_classes diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index ec9f3948403de..b6241def4e5d6 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -60,9 +60,9 @@ def test_validation(): @pytest.mark.parametrize("name", ["inplace", "copy"]) @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): - msg = 'For argument "%s" expected type bool, received type %s' % ( - name, - type(value).__name__, + msg = ( + f'For argument "{name}" expected type bool,' + f" received type {type(value).__name__}" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 11527efa4c39f..5085576cc96f0 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,6 +1,4 @@ from collections import OrderedDict -import warnings -from warnings import catch_warnings import numpy as np import pytest @@ -82,7 +80,6 @@ def test_agg(self): a_sum = r["A"].sum() b_mean = r["B"].mean() b_std = r["B"].std() - b_sum = r["B"].sum() result = r.aggregate([np.mean, np.std]) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) @@ 
-104,26 +101,18 @@ def test_agg(self): expected.columns = ["mean", "sum"] tm.assert_frame_equal(result, expected) - with catch_warnings(record=True): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): # using a dict with renaming - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - expected = concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) - tm.assert_frame_equal(result, expected, check_like=True) + r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate( + with pytest.raises(SpecificationError, match=msg): + r.aggregate( { "A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}, } ) - expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) - exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] - expected.columns = pd.MultiIndex.from_tuples(exp_cols) - tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) @@ -168,7 +157,7 @@ def test_agg_nested_dicts(self): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" + msg = "nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) @@ -178,25 +167,13 @@ def test_agg_nested_dicts(self): expected.columns = pd.MultiIndex.from_tuples( [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] ) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r[["A", "B"]].agg( + with pytest.raises(SpecificationError, match=msg): + r[["A", "B"]].agg( {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} ) - tm.assert_frame_equal(result, expected, check_like=True) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - expected.columns = pd.MultiIndex.from_tuples( - [ - ("A", "ra", "mean"), - ("A", "ra", "std"), - ("B", "rb", "mean"), - ("B", "rb", "std"), - ] - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) def test_count_nonnumeric_types(self): # GH12541 diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index b726bd3e3c8a7..189942bc07d2a 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -3,6 +3,7 @@ import pandas as pd from pandas import DataFrame, Series +from pandas.core.groupby.groupby import get_groupby import pandas.util.testing as tm @@ -13,18 +14,18 @@ def setup_method(self, method): def test_mutated(self): - msg = r"group\(\) got an unexpected keyword argument 'foo'" + msg = r"groupby\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): self.frame.groupby("A", foo=1) g = self.frame.groupby("A") assert not g.mutated - g = self.frame.groupby("A", mutated=True) + g = get_groupby(self.frame, by="A", mutated=True) assert g.mutated def test_getitem(self): g = self.frame.groupby("A") - g_mutated = self.frame.groupby("A", 
mutated=True) + g_mutated = get_groupby(self.frame, by="A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) @@ -45,7 +46,7 @@ def test_getitem_multiple(self): # GH 13174 g = self.frame.groupby("A") r = g.rolling(2) - g_mutated = self.frame.groupby("A", mutated=True) + g_mutated = get_groupby(self.frame, by="A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) result = r.B.count() @@ -59,7 +60,6 @@ def test_rolling(self): r = g.rolling(window=4) for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: - result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -69,8 +69,16 @@ def test_rolling(self): expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) tm.assert_frame_equal(result, expected) - result = r.quantile(0.5) - expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_rolling_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.rolling(window=4) + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): @@ -141,8 +149,16 @@ def test_expanding(self): expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) tm.assert_frame_equal(result, expected) - result = r.quantile(0.5) - expected = g.apply(lambda x: x.expanding().quantile(0.5)) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_expanding_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.expanding() + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index 36a0ddb3e02d7..6e4bc621d7f49 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -800,7 +800,7 @@ def _check_moment_func( has_time_rule=True, fill_value=None, zero_min_periods_equal=True, - **kwargs + **kwargs, ): # inject raw diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c237b094a0e01..898060d011372 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -49,7 +49,7 @@ # Offset names ("time rules") and related functions #: cache of previously seen offsets -_offset_map = {} # type: Dict[str, DateOffset] +_offset_map: Dict[str, DateOffset] = {} def get_period_alias(offset_str): @@ -308,7 +308,7 @@ def deltas_asi8(self): return unique_deltas(self.index.asi8) @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return len(self.deltas) == 1 @cache_readonly diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index d4f02286ff8d6..9417dc4b48499 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -344,7 +344,7 @@ class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass): Abstract interface to create holidays following certain rules. 
""" - rules = [] # type: List[Holiday] + rules: List[Holiday] = [] start_date = Timestamp(datetime(1970, 1, 1)) end_date = Timestamp(datetime(2200, 12, 31)) _cache = None diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f5e40e712642e..e516d30d5490f 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1817,8 +1817,8 @@ class QuarterOffset(DateOffset): Quarter representation - doesn't call super. """ - _default_startingMonth = None # type: Optional[int] - _from_name_startingMonth = None # type: Optional[int] + _default_startingMonth: Optional[int] = None + _from_name_startingMonth: Optional[int] = None _adjust_dst = True _attributes = frozenset(["n", "normalize", "startingMonth"]) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py deleted file mode 100644 index df41b4b5b40d9..0000000000000 --- a/pandas/tseries/plotting.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from pandas.plotting._matplotlib.timeseries import tsplot diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index f8c08ed8c099f..b8f17cd848292 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -327,9 +327,11 @@ def my_dog(has='fleas'): pass """ + addendum: Optional[str] + def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): if indents > 0: - self.addendum = indent(addendum, indents=indents) # type: Optional[str] + self.addendum = indent(addendum, indents=indents) else: self.addendum = addendum self.join = join diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index b516c3d78a11e..b9c165140aaad 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -199,7 +199,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: def skip_if_np_lt(ver_str, reason=None, *args, **kwds): if reason is None: - reason = "NumPy %s or greater required" % ver_str + reason = f"NumPy {ver_str} or greater required" return pytest.mark.skipif( _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds ) diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 0f5324c8d02ba..7822ecdeeb4d8 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -11,7 +11,7 @@ def test(extra_args=None): try: import pytest except ImportError: - raise ImportError("Need pytest>=4.0.2 to run tests") + raise ImportError("Need pytest>=5.0.1 to run tests") try: import hypothesis # noqa except ImportError: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4ba32c377a345..bcd12eba1651a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -316,7 +316,7 @@ def assert_almost_equal( check_exact=False, exact=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) elif isinstance(left, pd.Series): @@ -326,7 +326,7 @@ def assert_almost_equal( check_exact=False, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) elif isinstance(left, pd.DataFrame): @@ -336,7 +336,7 @@ def assert_almost_equal( check_exact=False, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) else: @@ -359,7 +359,7 @@ def assert_almost_equal( right, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) @@ -1952,7 +1952,7 @@ def keyfunc(x): label = "{prefix}_l{i}_g{j}".format(prefix=prefix, i=i, j=j) cnt[label] = ndupe_l[i] # cute Counter trick - 
result = list(sorted(cnt.elements(), key=keyfunc))[:nentries] + result = sorted(cnt.elements(), key=keyfunc)[:nentries] tuples.append(result) tuples = list(zip(*tuples)) diff --git a/pyproject.toml b/pyproject.toml index b105f8aeb3291..28d7c3d55c919 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,3 +10,23 @@ requires = [ "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", "numpy==1.16.0; python_version>='3.7' and platform_system=='AIX'", ] + +[tool.black] +target-version = ['py36', 'py37', 'py38'] +exclude = ''' +( + asv_bench/env + | \.egg + | \.git + | \.hg + | \.mypy_cache + | \.nox + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | setup.py +) +''' diff --git a/requirements-dev.txt b/requirements-dev.txt index 13e2c95126f0c..4d0e7ee904294 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,13 +3,13 @@ python-dateutil>=2.6.1 pytz asv cython>=0.29.13 -black<=19.3b0 +black==19.10b0 cpplint flake8 -flake8-comprehensions +flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 isort -mypy==0.720 +mypy==0.730 pycodestyle gitpython sphinx @@ -30,9 +30,9 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto -pytest>=4.0.2 +pytest>=5.0.1 pytest-cov -pytest-xdist +pytest-xdist>=1.21 seaborn statsmodels ipywidgets @@ -48,7 +48,7 @@ matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.1 beautifulsoup4>=4.6.0 -fastparquet>=0.2.1 +fastparquet>=0.3.2 html5lib lxml openpyxl diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 1d0f4b583bd0c..7c6f2fea97933 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -250,7 +250,7 @@ def __init__(self, name): self.clean_doc = pydoc.getdoc(obj) self.doc = NumpyDocString(self.clean_doc) - def __len__(self): + def __len__(self) -> int: return len(self.raw_doc) @staticmethod diff --git a/setup.cfg b/setup.cfg index 10670a4eae387..46e6b88f8018a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -145,10 +145,13 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True +[mypy-pandas.tests.indexes.datetimes.test_tools] +ignore_errors=True + [mypy-pandas.tests.indexes.test_base] ignore_errors=True -[mypy-pandas.tests.indexing.test_loc] +[mypy-pandas.tests.scalar.period.test_period] ignore_errors=True [mypy-pandas.tests.series.test_operators] diff --git a/setup.py b/setup.py index a7bc7a333cdd6..e6a95d4e7afd8 100755 --- a/setup.py +++ b/setup.py @@ -83,10 +83,7 @@ def is_platform_mac(): _pxi_dep_template = { - "algos": [ - "_libs/algos_common_helper.pxi.in", - "_libs/algos_take_helper.pxi.in", - ], + "algos": ["_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -347,12 +344,13 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", ] _cpp_pyxfiles = [ - "pandas/_libs/window.pyx", + "pandas/_libs/window/aggregations.pyx", "pandas/io/msgpack/_packer.pyx", "pandas/io/msgpack/_unpacker.pyx", ] @@ -465,7 +463,7 @@ def run(self): extra_link_args.append("/DEBUG") else: # args to ignore warnings - extra_compile_args = ["-Wno-unused-function"] + extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") @@ -544,7 +542,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ts_include = ["pandas/_libs/tslibs/src", 
"pandas/_libs/tslibs"] -lib_depends = ["pandas/_libs/src/parse_helper.h", "pandas/_libs/src/compat_helper.h"] +lib_depends = ["pandas/_libs/src/parse_helper.h"] np_datetime_headers = [ "pandas/_libs/tslibs/src/datetime/np_datetime.h", @@ -685,7 +683,12 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "sources": np_datetime_sources, }, "_libs.testing": {"pyxfile": "_libs/testing"}, - "_libs.window": {"pyxfile": "_libs/window", "language": "c++", "suffix": ".cpp"}, + "_libs.window.aggregations": { + "pyxfile": "_libs/window/aggregations", + "language": "c++", + "suffix": ".cpp" + }, + "_libs.window.indexers": {"pyxfile": "_libs/window/indexers"}, "_libs.writers": {"pyxfile": "_libs/writers"}, "io.sas._sas": {"pyxfile": "io/sas/sas"}, "io.msgpack._packer": { @@ -823,5 +826,5 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): entry_points={ "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, - **setuptools_kwargs + **setuptools_kwargs, )