facebookresearch · normster · Nov 1, 2021 · Nov 4, 2021
diff --git a/...hmark/linear_image_classification/resisc45/eval_resnet_8gpu_transfer_resisc45_linear.yaml b/...hmark/linear_image_classification/resisc45/eval_resnet_8gpu_transfer_resisc45_linear.yaml
@@ -0,0 +1,113 @@
+# @package _global_
+config:
+  VERBOSE: True
+  LOG_FREQUENCY: 200
+  # TEST_ONLY is off while TEST_MODEL is on, with a test pass requested every epoch.
+  # NOTE(review): presumably this trains the linear head and evaluates it after
+  # each epoch — confirm against the VISSL defaults.
+  TEST_ONLY: False
+  TEST_EVERY_NUM_EPOCH: 1
+  TEST_MODEL: True
+  SEED_VALUE: 1
+  MULTI_PROCESSING_METHOD: forkserver
+  HOOKS:
+    PERF_STATS:
+      MONITOR_PERF_STATS: True
+  DATA:
+    NUM_DATALOADER_WORKERS: 5
+    # Both splits read the "resisc45_folder" dataset as a disk_folder source
+    # (images and labels both come from the folder hierarchy).
+    TRAIN:
+      DATA_SOURCES: [disk_folder]
+      LABEL_SOURCES: [disk_folder]
+      DATASET_NAMES: [resisc45_folder]
+      # 32 per replica x 8 processes (DISTRIBUTED.NUM_PROC_PER_NODE) = 256,
+      # which is the OPTIMIZER base_lr_batch_size
+      BATCHSIZE_PER_REPLICA: 32
+      # Training augmentation: random resized crop to 224 + horizontal flip
+      TRANSFORMS:
+        - name: RandomResizedCrop
+          size: 224
+        - name: RandomHorizontalFlip
+        - name: ToTensor
+        # NOTE(review): these look like the usual ImageNet channel statistics — confirm
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      # Copy is disabled; the destination is kept identical for TRAIN and TEST
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/resisc45/
+    TEST:
+      DATA_SOURCES: [disk_folder]
+      LABEL_SOURCES: [disk_folder]
+      DATASET_NAMES: [resisc45_folder]
+      BATCHSIZE_PER_REPLICA: 32
+      # Deterministic evaluation pipeline: resize to 256 then center crop to 224
+      TRANSFORMS:
+        - name: Resize
+          size: 256
+        - name: CenterCrop
+          size: 224
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/resisc45/
+  # A single accuracy meter reporting top-1 only
+  METERS:
+    name: accuracy_list_meter
+    accuracy_list_meter:
+      num_meters: 1
+      topk_values: [1]
+  TRAINER:
+    TRAIN_STEP_NAME: standard_train_step
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      EVAL_MODE_ON: True
+      FREEZE_TRUNK_ONLY: True
+      SHOULD_FLATTEN_FEATS: False
+      # Features are read from "res5" and pooled by AdaptiveAvgPool2d to a 1x1 map
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["res5", ["AdaptiveAvgPool2d", [[1, 1]]]],
+      ]
+    TRUNK:
+      NAME: resnet
+      RESNETS:
+        DEPTH: 50
+    HEAD:
+      # The last entry of "dims" is the number of output logits. It has to be 45:
+      # RESISC45 has 45 scene classes and the preparation script creates one
+      # label per class folder, so a smaller head makes the loss fail on the
+      # labels it cannot represent.
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 2048, "dims": [2048, 45]}],
+      ]
+    WEIGHTS_INIT:
+      # Placeholder: replace with the path of the pretrained weights to evaluate
+      PARAMS_FILE: "specify the model weights"
+      STATE_DICT_KEY_NAME: classy_state_dict
+      # STATE_DICT_KEY_NAME: model_state_dict
+    SYNC_BN_CONFIG:
+      CONVERT_BN_TO_SYNC_BN: True
+      SYNC_BN_TYPE: apex
+      # Same value as DISTRIBUTED.NUM_PROC_PER_NODE
+      GROUP_SIZE: 8
+  LOSS:
+    name: cross_entropy_multiple_output_single_target
+    cross_entropy_multiple_output_single_target:
+      # NOTE(review): presumably targets equal to -1 are skipped by the loss — confirm
+      ignore_index: -1
+  # SGD with Nesterov momentum for 28 epochs; the learning rate is divided by 10
+  # at epochs 8, 16 and 24
+  OPTIMIZER:
+    name: sgd
+    # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4
+    # NOTE(review): the value below is 5e-4 although the trunk is a ResNet-50 —
+    # confirm this is intended.
+    weight_decay: 0.0005
+    momentum: 0.9
+    num_epochs: 28
+    nesterov: True
+    regularize_bn: False
+    regularize_bias: True
+    param_schedulers:
+      lr:
+        name: multistep
+        values: [0.01, 0.001, 0.0001, 0.00001]
+        milestones: [8, 16, 24]
+        update_interval: epoch
+        # base_value is given for a global batch size of 256
+        auto_lr_scaling:
+          auto_scale: true
+          base_value: 0.01
+          base_lr_batch_size: 256
+  # Single node running 8 processes on the gpu device, communicating through nccl
+  DISTRIBUTED:
+    BACKEND: nccl
+    NUM_NODES: 1
+    NUM_PROC_PER_NODE: 8
+    INIT_METHOD: tcp
+    RUN_ID: auto
+  MACHINE:
+    DEVICE: gpu
+  CHECKPOINT:
+    # NOTE(review): "." presumably resolves against the launch directory — confirm
+    DIR: "."
+    AUTO_RESUME: True
+    CHECKPOINT_FREQUENCY: 1
diff --git a/configs/config/dataset_catalog.json b/configs/config/dataset_catalog.json
@@ -162,5 +162,6 @@
     "google-imagenet1k-per10": {
         "train": ["<img_path>", "<lbl_path>"],
         "val": ["<img_path>", "<lbl_path>"]
-    }
+    },
+    "resisc45_folder": {
+        "train": ["<img_path>", "<lbl_path>"],
+        "val": ["<img_path>", "<lbl_path>"]
+    }
 }
diff --git a/extra_scripts/datasets/create_euro_sat_data_files.py b/extra_scripts/datasets/create_euro_sat_data_files.py
@@ -5,6 +5,7 @@
 
 import argparse
 import os
+import random
 import shutil
 
 from torch.utils.data import DataLoader
@@ -62,11 +63,13 @@ def __init__(self, input_path: str, output_path: str, train: bool):
         self.images = []
         self.targets = []
         self.labels = sorted(os.listdir(self.image_folder))
+        split_generator = random.Random(42)
 
         # There is no train/val split in the EUROSAT dataset, so we have to create it
         for i, label in enumerate(self.labels):
             label_path = os.path.join(self.image_folder, label)
             files = sorted(os.listdir(label_path))
+            files = split_generator.sample(files, self.TRAIN_SAMPLES + self.VALID_SAMPLES)
             if train:
                 self.images.extend(files[: self.TRAIN_SAMPLES])
                 self.targets.extend([i] * self.TRAIN_SAMPLES)

diff --git a/extra_scripts/datasets/create_resisc45_data_files.py b/extra_scripts/datasets/create_resisc45_data_files.py
@@ -0,0 +1,129 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+import random
+import shutil
+
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+
+# Link to the NWPU-RESISC45.rar archive. The archive has to be downloaded
+# manually: this URL is only shown in the --input help text and in the error
+# raised when --download is requested.
+RESISC45_URL = "https://1drv.ms/u/s!AmgKYzARBl5ca3HNaHIlzp_IXjs"
+
+
+def get_argument_parser():
+    """
+    Build the command line parser of the RESISC45 preparation script
+
+    Options:
+        -i / --input: path to the expanded NWPU-RESISC45.rar archive
+        -o / --output: folder where the classification dataset will be written
+        -d / --download: flag (False unless given) asking for an automatic download
+    """
+    input_help = "Path to the expanded NWPU-RESISC45.rar archive (download from: {})".format(RESISC45_URL)
+    output_help = "Folder where the classification dataset will be written"
+    download_help = "To download the original dataset and decompress it in the input folder"
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument("-i", "--input", type=str, help=input_help)
+    cli.add_argument("-o", "--output", type=str, help=output_help)
+    # "store_true" is the argparse shorthand for store_const with const=True
+    # and default=False
+    cli.add_argument("-d", "--download", action="store_true", help=download_help)
+    return cli
+
+
+class _RESISC45:
+    """
+    Dataset used to parallelize the transformation of the dataset via a DataLoader
+
+    The input folder is expected to hold one sub-folder per class. For each class,
+    the files are shuffled with a fixed seed: the first TRAIN_SPLIT_PERCENT of them
+    form the "train" split and the following TEST_SPLIT_PERCENT form the "test"
+    split. Reading an item copies the matching image to the output folder.
+    """
+
+    TRAIN_SPLIT_PERCENT = .8
+    TEST_SPLIT_PERCENT = .2
+
+    def __init__(self, input_path: str, output_path: str, train: bool):
+        """
+        Args:
+            input_path: folder of the expanded archive (one sub-folder per class)
+            output_path: root folder under which "train" / "test" are written
+            train: True to select the training split, False for the test split
+        """
+        self.input_path = input_path
+        self.output_path = output_path
+        self.train = train
+        self.images = []
+        self.targets = []
+        self.labels = sorted(os.listdir(self.input_path))
+        # Fixed seed: the train and the test instances must draw the exact same
+        # permutations so that their slices never overlap
+        split_generator = random.Random(42)
+
+        # There is no train/val split in the RESISC45 dataset, so we have to create it
+        for i, label in enumerate(self.labels):
+            label_path = os.path.join(self.input_path, label)
+            # Sorted first, as os.listdir returns entries in arbitrary order
+            files = sorted(os.listdir(label_path))
+            # shuffle() works in place and returns None: its result must not be
+            # assigned back to 'files'
+            split_generator.shuffle(files)
+            train_samples = int(self.TRAIN_SPLIT_PERCENT * len(files))
+            test_samples = int(self.TEST_SPLIT_PERCENT * len(files))
+            if train:
+                selected = files[:train_samples]
+            else:
+                selected = files[train_samples: train_samples + test_samples]
+            self.images.extend(selected)
+            # One target per selected image keeps both lists aligned
+            self.targets.extend([i] * len(selected))
+
+    def __len__(self):
+        return len(self.targets)
+
+    def __getitem__(self, idx: int) -> bool:
+        """
+        Copy the image at index 'idx' to '<output_path>/<split>/<label>/' and return True
+
+        The destination folder is not created here and has to exist beforehand.
+        """
+        image_name = self.images[idx]
+        target = self.labels[self.targets[idx]]
+        image_path = os.path.join(self.input_path, target, image_name)
+        split_name = "train" if self.train else "test"
+        shutil.copy(
+            image_path, os.path.join(self.output_path, split_name, target, image_name)
+        )
+        return True
+
+
+def create_disk_folder_split(dataset: _RESISC45, split_path: str):
+    """
+    Materialize one split (for instance "train" or "test") of the disk_folder hierarchy
+
+    One sub-folder per label is created under 'split_path', then the whole dataset
+    is read once through a DataLoader, displaying a progress bar.
+    """
+    for class_name in dataset.labels:
+        os.makedirs(os.path.join(split_path, class_name), exist_ok=True)
+
+    # Batches of a single sample, unwrapped by the collate function: the items
+    # themselves are discarded, only reading them matters
+    sample_loader = DataLoader(
+        dataset, num_workers=8, batch_size=1, collate_fn=lambda x: x[0]
+    )
+    for _ in tqdm(sample_loader, total=len(dataset)):
+        pass
+
+
+def create_resisc_disk_folder(input_path: str, output_path: str):
+    """
+    Read the RESISC45 dataset at 'input_path' and transform it to a disk folder at 'output_path'
+
+    Two sub-folders are produced under 'output_path': "train" and "test".
+
+    Args:
+        input_path: folder of the expanded archive (one sub-folder per class)
+        output_path: root folder of the generated disk_folder hierarchy
+    """
+    # (name announced to the user, train flag, output sub-folder). The sub-folder
+    # names have to match the ones _RESISC45.__getitem__ copies the images to.
+    splits = [("training", True, "train"), ("test", False, "test")]
+    for split_label, is_train, folder in splits:
+        print("Creating the {} split...".format(split_label))
+        create_disk_folder_split(
+            dataset=_RESISC45(input_path, output_path=output_path, train=is_train),
+            split_path=os.path.join(output_path, folder),
+        )
+
+
+if __name__ == "__main__":
+    """
+    Example usage:
+
+    ```
+    python extra_scripts/datasets/create_resisc45_data_files.py -i /path/to/resisc45 -o /output_path/to/resisc45
+    ```
+    """
+    parser = get_argument_parser()
+    args = parser.parse_args()
+    if args.download:
+        raise Exception("Cannot automatically download RESISC45. You can manually download the archive at {}".format(RESISC45_URL))
+    # Neither path has a default value: stop here with a usage message rather
+    # than handing None over to the os / os.path calls made further down
+    if args.input is None or args.output is None:
+        parser.error("both -i/--input and -o/--output are required")
+    create_resisc_disk_folder(args.input, args.output)