Skip to content
This repository was archived by the owner on Mar 19, 2024. It is now read-only.

Add resisc45 and update eurosat datasets #464

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# @package _global_
config:
VERBOSE: True
LOG_FREQUENCY: 200
TEST_ONLY: False
TEST_EVERY_NUM_EPOCH: 1
TEST_MODEL: True
SEED_VALUE: 1
MULTI_PROCESSING_METHOD: forkserver
HOOKS:
PERF_STATS:
MONITOR_PERF_STATS: True
DATA:
NUM_DATALOADER_WORKERS: 5
TRAIN:
DATA_SOURCES: [disk_folder]
LABEL_SOURCES: [disk_folder]
DATASET_NAMES: [resisc45_folder]
BATCHSIZE_PER_REPLICA: 32
TRANSFORMS:
- name: RandomResizedCrop
size: 224
- name: RandomHorizontalFlip
- name: ToTensor
- name: Normalize
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
MMAP_MODE: True
COPY_TO_LOCAL_DISK: False
COPY_DESTINATION_DIR: /tmp/resisc45/
TEST:
DATA_SOURCES: [disk_folder]
LABEL_SOURCES: [disk_folder]
DATASET_NAMES: [resisc45_folder]
BATCHSIZE_PER_REPLICA: 32
TRANSFORMS:
- name: Resize
size: 256
- name: CenterCrop
size: 224
- name: ToTensor
- name: Normalize
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
MMAP_MODE: True
COPY_TO_LOCAL_DISK: False
COPY_DESTINATION_DIR: /tmp/resisc45/
METERS:
name: accuracy_list_meter
accuracy_list_meter:
num_meters: 1
topk_values: [1]
TRAINER:
TRAIN_STEP_NAME: standard_train_step
MODEL:
FEATURE_EVAL_SETTINGS:
EVAL_MODE_ON: True
FREEZE_TRUNK_ONLY: True
SHOULD_FLATTEN_FEATS: False
LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
["res5", ["AdaptiveAvgPool2d", [[1, 1]]]],
]
TRUNK:
NAME: resnet
RESNETS:
DEPTH: 50
HEAD:
PARAMS: [
["eval_mlp", {"in_channels": 2048, "dims": [2048, 45]}],
]
WEIGHTS_INIT:
PARAMS_FILE: "specify the model weights"
STATE_DICT_KEY_NAME: classy_state_dict
# STATE_DICT_KEY_NAME: model_state_dict
SYNC_BN_CONFIG:
CONVERT_BN_TO_SYNC_BN: True
SYNC_BN_TYPE: apex
GROUP_SIZE: 8
LOSS:
name: cross_entropy_multiple_output_single_target
cross_entropy_multiple_output_single_target:
ignore_index: -1
OPTIMIZER:
name: sgd
# In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4
weight_decay: 0.0005
momentum: 0.9
num_epochs: 28
nesterov: True
regularize_bn: False
regularize_bias: True
param_schedulers:
lr:
auto_lr_scaling:
auto_scale: true
base_value: 0.01
base_lr_batch_size: 256
name: multistep
values: [0.01, 0.001, 0.0001, 0.00001]
milestones: [8, 16, 24]
update_interval: epoch
DISTRIBUTED:
BACKEND: nccl
NUM_NODES: 1
NUM_PROC_PER_NODE: 8
INIT_METHOD: tcp
RUN_ID: auto
MACHINE:
DEVICE: gpu
CHECKPOINT:
DIR: "."
AUTO_RESUME: True
CHECKPOINT_FREQUENCY: 1
3 changes: 2 additions & 1 deletion configs/config/dataset_catalog.json
Original file line number Diff line number Diff line change
Expand Up @@ -162,5 +162,6 @@
"google-imagenet1k-per10": {
"train": ["<img_path>", "<lbl_path>"],
"val": ["<img_path>", "<lbl_path>"]
}
},
"resisc45_folder": {}
}
3 changes: 3 additions & 0 deletions extra_scripts/datasets/create_euro_sat_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import argparse
import os
import random
import shutil

from torch.utils.data import DataLoader
Expand Down Expand Up @@ -62,11 +63,13 @@ def __init__(self, input_path: str, output_path: str, train: bool):
self.images = []
self.targets = []
self.labels = sorted(os.listdir(self.image_folder))
split_generator = random.Random(42)

# There is no train/val split in the EUROSAT dataset, so we have to create it
for i, label in enumerate(self.labels):
label_path = os.path.join(self.image_folder, label)
files = sorted(os.listdir(label_path))
files = split_generator.sample(files, self.TRAIN_SAMPLES + self.VALID_SAMPLES)
if train:
self.images.extend(files[: self.TRAIN_SAMPLES])
self.targets.extend([i] * self.TRAIN_SAMPLES)
Expand Down
129 changes: 129 additions & 0 deletions extra_scripts/datasets/create_resisc45_data_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
import random
import shutil

from torch.utils.data import DataLoader
from tqdm import tqdm


RESISC45_URL = "https://1drv.ms/u/s!AmgKYzARBl5ca3HNaHIlzp_IXjs"


def get_argument_parser():
    """
    Build the command line parser of the script

    Options:
        -i / --input: path to the expanded NWPU-RESISC45 archive
        -o / --output: folder where the classification dataset will be written
        -d / --download: flag (False unless given) asking for an automatic download
    """
    input_help = "Path to the expanded NWPU-RESISC45.rar archive (download from: {})"
    output_help = "Folder where the classification dataset will be written"
    download_help = (
        "To download the original dataset and decompress it in the input folder"
    )

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-i", "--input", type=str, help=input_help.format(RESISC45_URL)
    )
    arg_parser.add_argument("-o", "--output", type=str, help=output_help)
    # "store_true" is the built-in shorthand for store_const/const=True/default=False
    arg_parser.add_argument("-d", "--download", action="store_true", help=download_help)
    return arg_parser


class _RESISC45:
"""
Dataset used to parallelize the transformation of the dataset via a DataLoader
"""

TRAIN_SPLIT_PERCENT = .8
TEST_SPLIT_PERCENT = .2

def __init__(self, input_path: str, output_path: str, train: bool):
self.input_path = input_path
self.output_path = output_path
self.train = train
self.images = []
self.targets = []
self.labels = sorted(os.listdir(self.input_path))
split_generator = random.Random(42)

# There is no train/val split in the RESISC45 dataset, so we have to create it
for i, label in enumerate(self.labels):
label_path = os.path.join(self.input_path, label)
files = sorted(os.listdir(label_path))
files = split_generator.shuffle(files)
train_samples = int(self.TRAIN_SPLIT_PERCENT * len(files))
test_samples = int(self.TEST_SPLIT_PERCENT * len(files))
if train:
self.images.extend(files[: train_samples])
self.targets.extend([i] * train_samples)
else:
self.images.extend(
files[train_samples: train_samples + test_samples]
)
self.targets.extend([i] * test_samples)

def __len__(self):
return len(self.targets)

def __getitem__(self, idx: int) -> bool:
image_name = self.images[idx]
target = self.labels[self.targets[idx]]
image_path = os.path.join(self.input_path, target, image_name)
split_name = "train" if self.train else "test"
shutil.copy(
image_path, os.path.join(self.output_path, split_name, target, image_name)
)
return True


def create_disk_folder_split(dataset: _RESISC45, split_path: str):
    """
    Create one split (example: "train" or "test") of the disk_folder hierarchy

    Args:
        dataset: dataset whose items perform the copy of one image when indexed
        split_path: folder of the split, in which one sub-folder per label is created
    """
    # Imported locally so that this function does not depend on an additional
    # file-level import
    from operator import itemgetter

    # The label folders must exist before the workers start copying into them
    for label in dataset.labels:
        os.makedirs(os.path.join(split_path, label), exist_ok=True)

    # The DataLoader is only used to run the copies in parallel worker processes.
    # itemgetter(0) unwraps the single-element batch exactly like the former
    # 'lambda x: x[0]', but it can be pickled, which worker processes started
    # with the "spawn" method require (a lambda cannot be pickled)
    loader = DataLoader(
        dataset, num_workers=8, batch_size=1, collate_fn=itemgetter(0)
    )
    with tqdm(total=len(dataset)) as progress_bar:
        for _ in loader:
            progress_bar.update(1)


def create_resisc_disk_folder(input_path: str, output_path: str):
    """
    Convert the RESISC45 dataset found at 'input_path' into a disk folder
    dataset written at 'output_path', made of a "train" and a "test" split
    """
    # (progress message, train flag, name of the split sub-folder)
    split_descriptions = (
        ("Creating the training split...", True, "train"),
        ("Creating the validation split...", False, "test"),
    )
    for message, is_train, split_name in split_descriptions:
        print(message)
        split_dataset = _RESISC45(input_path, output_path=output_path, train=is_train)
        create_disk_folder_split(
            dataset=split_dataset,
            split_path=os.path.join(output_path, split_name),
        )


if __name__ == "__main__":
    # Example usage:
    #
    #   python extra_scripts/datasets/create_resisc45_data_files.py -i /path/to/resisc45 -o /output_path/to/resisc45
    #
    parser = get_argument_parser()
    args = parser.parse_args()
    if args.download:
        # No automated download is implemented: point the user to the archive
        raise Exception(
            "Cannot automatically download RESISC45. You can manually download the archive at {}".format(
                RESISC45_URL
            )
        )
    # Both paths are optional for the parser: without this check a missing one
    # would reach the conversion as None and fail with an obscure error
    if args.input is None or args.output is None:
        parser.error("both -i/--input and -o/--output are required")
    create_resisc_disk_folder(args.input, args.output)