Chesapeake: update to 2022 edition (#2214)

* Chesapeake: update to 2022 edition
* Hard-code cmap, don't want to write yet another test file
* Update tests
* Update datamodule
* Bug fix
* Update test config
* Fix plotting
* More classes
* Fix docs
microsoft · Aug 17, 2024 · 294a59b · 294a59b
1 parent 57a28a9
commit 294a59b
Show file tree

Hide file tree

Showing 30 changed files with 252 additions and 340 deletions.
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -47,8 +47,6 @@ Chesapeake Land Cover
 ^^^^^^^^^^^^^^^^^^^^^
 
 .. autoclass:: Chesapeake
-.. autoclass:: Chesapeake7
-.. autoclass:: Chesapeake13
 .. autoclass:: ChesapeakeDC
 .. autoclass:: ChesapeakeDE
 .. autoclass:: ChesapeakeMD

diff --git a/docs/api/geo_datasets.csv b/docs/api/geo_datasets.csv
@@ -4,7 +4,7 @@ Dataset,Type,Source,License,Size (px),Resolution (m)
 `Airphen`_,Imagery,Airphen,-,"1,280x960",0.047--0.09
 `Aster Global DEM`_,DEM,Aster,"public domain","3,601x3,601",30
 `Canadian Building Footprints`_,Geometries,Bing Imagery,"ODbL-1.0",-,-
-`Chesapeake Land Cover`_,"Imagery, Masks",NAIP,"CC-BY-4.0",-,1
+`Chesapeake Land Cover`_,"Imagery, Masks",NAIP,"CC0-1.0",-,1
 `Global Mangrove Distribution`_,Masks,"Remote Sensing, In Situ Measurements","public domain",-,3
 `Cropland Data Layer`_,Masks,Landsat,"public domain",-,30
 `EDDMapS`_,Points,Citizen Scientists,-,-,-

diff --git a/pyproject.toml b/pyproject.toml
@@ -99,8 +99,6 @@ datasets = [
     "scikit-image>=0.19",
     # scipy 1.7.2+ required for Python 3.10 wheels
     "scipy>=1.7.2",
-    # zipfile-deflate64 0.2+ required for Python 3.10 wheels
-    "zipfile-deflate64>=0.2",
 ]
 docs = [
     # ipywidgets 7+ required by nbsphinx

diff --git a/requirements/datasets.txt b/requirements/datasets.txt
@@ -8,4 +8,3 @@ radiant-mlhub==0.4.1
 rarfile==4.2
 scikit-image==0.24.0
 scipy==1.14.0
-zipfile-deflate64==0.2.0
diff --git a/requirements/min-reqs.old b/requirements/min-reqs.old
@@ -31,7 +31,6 @@ radiant-mlhub==0.3.0
 rarfile==4.0
 scikit-image==0.19.0
 scipy==1.7.2
-zipfile-deflate64==0.2.0
 
 # tests
 pytest==7.3.0

diff --git a/tests/conf/naipchesapeake.yaml b/tests/conf/naipchesapeake.yaml
@@ -3,9 +3,9 @@ model:
   init_args:
     loss: "ce"
     model: "deeplabv3+"
-    backbone: "resnet34"
+    backbone: "resnet18"
     in_channels: 4
-    num_classes: 14
+    num_classes: 128
     num_filters: 1
     ignore_index: null
 data:
@@ -15,4 +15,4 @@ data:
     patch_size: 32
   dict_kwargs:
     naip_paths: "tests/data/naip"
-    chesapeake_paths: "tests/data/chesapeake/BAYWIDE"
+    chesapeake_paths: "tests/data/chesapeake/lulc"
diff --git a/tests/data/chesapeake/BAYWIDE/Baywide_13Class_20132014.tif b/tests/data/chesapeake/BAYWIDE/Baywide_13Class_20132014.tif
diff --git a/tests/data/chesapeake/BAYWIDE/Baywide_13Class_20132014.zip b/tests/data/chesapeake/BAYWIDE/Baywide_13Class_20132014.zip
diff --git a/tests/data/chesapeake/BAYWIDE/data.py b/tests/data/chesapeake/lulc/data.py
@@ -4,16 +4,14 @@
 # Licensed under the MIT License.
 
 import hashlib
-import os
-import subprocess
+import shutil
 
 import numpy as np
 import rasterio
 from rasterio.crs import CRS
 from rasterio.transform import Affine
 
 SIZE = 128  # image width/height
-NUM_CLASSES = 14
 
 np.random.seed(0)
 
@@ -41,24 +39,50 @@
     AXIS["Easting",EAST],
     AXIS["Northing",NORTH]]
 """
-cmap = {
-    0: (0, 0, 0, 255),
-    1: (0, 197, 255, 255),
-    2: (0, 168, 132, 255),
-    3: (38, 115, 0, 255),
-    4: (76, 230, 0, 255),
-    5: (163, 255, 115, 255),
-    6: (255, 170, 0, 255),
-    7: (255, 0, 0, 255),
-    8: (156, 156, 156, 255),
-    9: (0, 0, 0, 255),
-    10: (115, 115, 0, 255),
-    11: (230, 230, 0, 255),
-    12: (255, 255, 115, 255),
-    13: (197, 0, 255, 255),
-}
 
 
+values = [
+    11,
+    12,
+    13,
+    14,
+    15,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    41,
+    42,
+    51,
+    52,
+    53,
+    54,
+    55,
+    56,
+    62,
+    63,
+    64,
+    65,
+    72,
+    73,
+    74,
+    75,
+    83,
+    84,
+    85,
+    91,
+    92,
+    93,
+    94,
+    95,
+    127,
+]
+
 meta = {
     'driver': 'GTiff',
     'dtype': 'uint8',
@@ -70,26 +94,18 @@
     'transform': Affine(1.0, 0.0, 1303555.0000000005, 0.0, -1.0, 2535064.999999998),
 }
 
-# Remove old data
-if os.path.exists(f'{filename}.tif'):
-    os.remove(f'{filename}.tif')
+for state in ['dc', 'de', 'md', 'ny', 'pa', 'va', 'wv']:
+    filename = f'{state}_lulc_2018_2022-Edition'
 
-# Create raster file
-with rasterio.open(f'{filename}.tif', 'w', **meta) as f:
-    data = np.random.randint(NUM_CLASSES, size=(SIZE, SIZE), dtype=np.uint8)
-    f.write(data, 1)
-    f.write_colormap(1, cmap)
+    # Create raster file
+    with rasterio.open(f'{filename}.tif', 'w', **meta) as f:
+        data = np.random.choice(values, size=(SIZE, SIZE))
+        f.write(data, 1)
 
-# Create zip file
-# 7z required to create a zip file using the proprietary DEFLATE64 compression algorithm
-# https://github.com/brianhelba/zipfile-deflate64/issues/19#issuecomment-1006077294
-subprocess.run(
-    ['7z', 'a', f'{filename}.zip', '-mm=DEFLATE64', f'{filename}.tif'],
-    capture_output=True,
-    check=True,
-)
+    # Compress file
+    shutil.make_archive(filename, 'zip', '.', filename + '.tif')
 
-# Compute checksums
-with open(f'{filename}.zip', 'rb') as f:
-    md5 = hashlib.md5(f.read()).hexdigest()
-    print(repr(md5))
+    # Compute checksums
+    with open(f'{filename}.zip', 'rb') as f:
+        md5 = hashlib.md5(f.read()).hexdigest()
+        print(state, repr(md5))
diff --git a/tests/data/chesapeake/lulc/dc_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/dc_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/dc_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/dc_lulc_2018_2022-Edition.zip
diff --git a/tests/data/chesapeake/lulc/de_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/de_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/de_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/de_lulc_2018_2022-Edition.zip
diff --git a/tests/data/chesapeake/lulc/md_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/md_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/md_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/md_lulc_2018_2022-Edition.zip
diff --git a/tests/data/chesapeake/lulc/ny_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/ny_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/ny_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/ny_lulc_2018_2022-Edition.zip
diff --git a/tests/data/chesapeake/lulc/pa_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/pa_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/pa_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/pa_lulc_2018_2022-Edition.zip
diff --git a/tests/data/chesapeake/lulc/va_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/va_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/va_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/va_lulc_2018_2022-Edition.zip
diff --git a/tests/data/chesapeake/lulc/wv_lulc_2018_2022-Edition.tif b/tests/data/chesapeake/lulc/wv_lulc_2018_2022-Edition.tif
diff --git a/tests/data/chesapeake/lulc/wv_lulc_2018_2022-Edition.zip b/tests/data/chesapeake/lulc/wv_lulc_2018_2022-Edition.zip
diff --git a/tests/datasets/test_chesapeake.py b/tests/datasets/test_chesapeake.py
@@ -16,85 +16,79 @@
 import torchgeo.datasets.utils
 from torchgeo.datasets import (
     BoundingBox,
-    Chesapeake13,
     ChesapeakeCVPR,
+    ChesapeakeDC,
     DatasetNotFoundError,
     IntersectionDataset,
     UnionDataset,
 )
 
-pytest.importorskip('zipfile_deflate64')
-
 
 def download_url(url: str, root: str | Path, *args: str, **kwargs: str) -> None:
     shutil.copy(url, root)
 
 
-class TestChesapeake13:
+class TestChesapeakeDC:
     @pytest.fixture
-    def dataset(self, monkeypatch: MonkeyPatch, tmp_path: Path) -> Chesapeake13:
+    def dataset(self, monkeypatch: MonkeyPatch, tmp_path: Path) -> ChesapeakeDC:
         monkeypatch.setattr(torchgeo.datasets.chesapeake, 'download_url', download_url)
-        md5 = 'fe35a615b8e749b21270472aa98bb42c'
-        monkeypatch.setattr(Chesapeake13, 'md5', md5)
         url = os.path.join(
-            'tests', 'data', 'chesapeake', 'BAYWIDE', 'Baywide_13Class_20132014.zip'
+            'tests',
+            'data',
+            'chesapeake',
+            'lulc',
+            '{state}_lulc_{year}_2022-Edition.zip',
         )
-        monkeypatch.setattr(Chesapeake13, 'url', url)
+        monkeypatch.setattr(ChesapeakeDC, 'url', url)
+        md5s = {2018: '35c644f13ccdb1baf62adf85cb8c7e48'}
+        monkeypatch.setattr(ChesapeakeDC, 'md5s', md5s)
         monkeypatch.setattr(plt, 'show', lambda *args: None)
-        root = tmp_path
         transforms = nn.Identity()
-        return Chesapeake13(root, transforms=transforms, download=True, checksum=True)
+        return ChesapeakeDC(
+            tmp_path, transforms=transforms, download=True, checksum=True
+        )
 
-    def test_getitem(self, dataset: Chesapeake13) -> None:
+    def test_getitem(self, dataset: ChesapeakeDC) -> None:
         x = dataset[dataset.bounds]
         assert isinstance(x, dict)
         assert isinstance(x['crs'], CRS)
         assert isinstance(x['mask'], torch.Tensor)
 
-    def test_len(self, dataset: Chesapeake13) -> None:
+    def test_len(self, dataset: ChesapeakeDC) -> None:
         assert len(dataset) == 1
 
-    def test_and(self, dataset: Chesapeake13) -> None:
+    def test_and(self, dataset: ChesapeakeDC) -> None:
         ds = dataset & dataset
         assert isinstance(ds, IntersectionDataset)
 
-    def test_or(self, dataset: Chesapeake13) -> None:
+    def test_or(self, dataset: ChesapeakeDC) -> None:
         ds = dataset | dataset
         assert isinstance(ds, UnionDataset)
 
-    def test_already_extracted(self, dataset: Chesapeake13) -> None:
-        Chesapeake13(dataset.paths, download=True)
+    def test_already_extracted(self, dataset: ChesapeakeDC) -> None:
+        ChesapeakeDC(dataset.paths, download=True)
 
     def test_already_downloaded(self, tmp_path: Path) -> None:
         url = os.path.join(
-            'tests', 'data', 'chesapeake', 'BAYWIDE', 'Baywide_13Class_20132014.zip'
+            'tests', 'data', 'chesapeake', 'lulc', 'dc_lulc_2018_2022-Edition.zip'
         )
-        root = tmp_path
-        shutil.copy(url, root)
-        Chesapeake13(root)
+        shutil.copy(url, tmp_path)
+        ChesapeakeDC(tmp_path)
 
     def test_not_downloaded(self, tmp_path: Path) -> None:
         with pytest.raises(DatasetNotFoundError, match='Dataset not found'):
-            Chesapeake13(tmp_path, checksum=True)
+            ChesapeakeDC(tmp_path, checksum=True)
 
-    def test_plot(self, dataset: Chesapeake13) -> None:
+    def test_plot(self, dataset: ChesapeakeDC) -> None:
         query = dataset.bounds
         x = dataset[query]
         dataset.plot(x, suptitle='Test')
         plt.close()
-
-    def test_plot_prediction(self, dataset: Chesapeake13) -> None:
-        query = dataset.bounds
-        x = dataset[query]
         x['prediction'] = x['mask'].clone()
         dataset.plot(x, suptitle='Prediction')
         plt.close()
 
-    def test_url(self) -> None:
-        ds = Chesapeake13(os.path.join('tests', 'data', 'chesapeake', 'BAYWIDE'))
-        assert 'cicwebresources.blob.core.windows.net' in ds.url
-
-    def test_invalid_query(self, dataset: Chesapeake13) -> None:
+    def test_invalid_query(self, dataset: ChesapeakeDC) -> None:
         query = BoundingBox(0, 0, 0, 0, 0, 0)
         with pytest.raises(
             IndexError, match='query: .* not found in index with bounds:'

diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py
@@ -76,15 +76,12 @@ def download_url(url: str, root: str | Path, *args: str) -> None:
         os.path.join('cowc_detection', 'COWC_test_list_detection.txt.bz2'),
         os.path.join('vhr10', 'NWPU VHR-10 dataset.rar'),
         os.path.join('landcoverai', 'landcover.ai.v1.zip'),
-        os.path.join('chesapeake', 'BAYWIDE', 'Baywide_13Class_20132014.zip'),
         os.path.join('sen12ms', 'ROIs1158_spring_lc.tar.gz'),
     ],
 )
 def test_extract_archive(src: str, tmp_path: Path) -> None:
     if src.endswith('.rar'):
         pytest.importorskip('rarfile', minversion='4')
-    if src.startswith('chesapeake'):
-        pytest.importorskip('zipfile_deflate64')
     extract_archive(os.path.join('tests', 'data', src), tmp_path)
 
 

diff --git a/tests/trainers/test_segmentation.py b/tests/trainers/test_segmentation.py
@@ -89,8 +89,6 @@ def test_trainer(
                     'ecec8e871faf1bbd8ca525ca95ddc1c1f5213f40afb94599884bd85f990ebd6b'
                 )
                 monkeypatch.setattr(LandCoverAI, 'sha256', sha256)
-            case 'naipchesapeake':
-                pytest.importorskip('zipfile_deflate64')
 
         config = os.path.join('tests', 'conf', name + '.yaml')
 

diff --git a/torchgeo/datamodules/naip.py b/torchgeo/datamodules/naip.py
@@ -8,7 +8,17 @@
 import kornia.augmentation as K
 from matplotlib.figure import Figure
 
-from ..datasets import NAIP, BoundingBox, Chesapeake13
+from ..datasets import (
+    NAIP,
+    BoundingBox,
+    ChesapeakeDC,
+    ChesapeakeDE,
+    ChesapeakeMD,
+    ChesapeakeNY,
+    ChesapeakePA,
+    ChesapeakeVA,
+    ChesapeakeWV,
+)
 from ..samplers import GridGeoSampler, RandomBatchGeoSampler
 from ..transforms import AugmentationSequential
 from .geo import GeoDataModule
@@ -37,7 +47,7 @@ def __init__(
             num_workers: Number of workers for parallel data loading.
             **kwargs: Additional keyword arguments passed to
                 :class:`~torchgeo.datasets.NAIP` (prefix keys with ``naip_``) and
-                :class:`~torchgeo.datasets.Chesapeake13`
+                :class:`~torchgeo.datasets.Chesapeake`
                 (prefix keys with ``chesapeake_``).
         """
         self.naip_kwargs = {}
@@ -49,12 +59,7 @@ def __init__(
                 self.chesapeake_kwargs[key[11:]] = val
 
         super().__init__(
-            Chesapeake13,
-            batch_size,
-            patch_size,
-            length,
-            num_workers,
-            **self.chesapeake_kwargs,
+            NAIP, batch_size, patch_size, length, num_workers, **self.naip_kwargs
         )
 
         self.aug = AugmentationSequential(
@@ -67,9 +72,16 @@ def setup(self, stage: str) -> None:
         Args:
             stage: Either 'fit', 'validate', 'test', or 'predict'.
         """
-        self.chesapeake = Chesapeake13(**self.chesapeake_kwargs)
         self.naip = NAIP(**self.naip_kwargs)
-        self.dataset = self.chesapeake & self.naip
+        dc = ChesapeakeDC(**self.chesapeake_kwargs)
+        de = ChesapeakeDE(**self.chesapeake_kwargs)
+        md = ChesapeakeMD(**self.chesapeake_kwargs)
+        ny = ChesapeakeNY(**self.chesapeake_kwargs)
+        pa = ChesapeakePA(**self.chesapeake_kwargs)
+        va = ChesapeakeVA(**self.chesapeake_kwargs)
+        wv = ChesapeakeWV(**self.chesapeake_kwargs)
+        self.chesapeake = dc | de | md | ny | pa | va | wv
+        self.dataset = self.naip & self.chesapeake
 
         roi = self.dataset.bounds
         midx = roi.minx + (roi.maxx - roi.minx) / 2

diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
@@ -16,8 +16,6 @@
 from .chabud import ChaBuD
 from .chesapeake import (
     Chesapeake,
-    Chesapeake7,
-    Chesapeake13,
     ChesapeakeCVPR,
     ChesapeakeDC,
     ChesapeakeDE,