Fix gem logic, reverse cropping/transformation order. (facebookresearch#379)

iseessel · facebook-github-bot · commit e2fece3753d6 · 2021-08-09T14:59:56.000-07:00
Summary: Pull Request resolved: facebookresearch#379 1. Fix the gem post processing logic. Before this change, the code assumes that each non-preprocessed feature tensor has the same tensor shape: ``` if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem": gem_out_fname = f"{out_dir}/{train_dataset_name}_GeM.npy" train_features = torch.tensor(np.concatenate(train_features)) ``` This is not the case, since ROxford/RParis images do not have a standard size, hence the resx layers have different height and widths (but same number of channels). GeM pooling will transform an image of any shape to a shape of `(num_channels)` The change performs gem_pooling on each individual images, as opposed to all the images at once. This should be fine because both gem and l2 normalization are to be performed per-image. 2. Transform before cropping to the bounding box (as opposed to after cropping). The experiments show that this yields much better results. This is also what the deepcluster implentation uses: https://github.com/facebookresearch/deepcluster/blob/master/eval_retrieval.py#L44 ``` Oxford: 61.57 / 41.74 / 14.33 vs. 69.65 / 48.51 / 16.41 Paris: 83.7 / 66.87 / 44.81 vs. 87.9 / 70.57 / 47.39 ``` f288434289 f288438150 Differential Revision: D29993204 fbshipit-source-id: 6eb48e00011704c6f670f60417e2ed53a9ff0cb9
diff --git a/tools/instance_retrieval_test.py b/tools/instance_retrieval_test.py
@@ -80,20 +80,6 @@ def build_retrieval_model(cfg):
     return model
 
 
-def gem_pool_and_save_features(features, p, add_bias, gem_out_fname):
-    if gem_out_fname and PathManager.exists(gem_out_fname):
-        logging.info("Loading train GeM features...")
-        features = load_file(gem_out_fname)
-    else:
-        logging.info(f"GeM pooling features: {features.shape}")
-        features = l2n(gem(features, p=p, add_bias=True))
-
-        if gem_out_fname:
-            save_file(features, gem_out_fname, verbose=False)
-            logging.info(f"Saved GeM features to: {gem_out_fname}")
-    return features
-
-
 def get_train_features(
     cfg,
     temp_dir,
@@ -120,7 +106,7 @@ def process_train_image(i, out_dir):
         else:
             fname_in = train_dataset.get_filename(i)
             if is_revisited_dataset(train_dataset_name):
-                img = image_helper.load_and_prepare_revisited_image(fname_in)
+                img = image_helper.load_and_prepare_revisited_image(fname_in, roi=None)
             elif is_whiten_dataset(train_dataset_name):
                 img = image_helper.load_and_prepare_whitening_image(fname_in)
             else:
@@ -136,6 +122,14 @@ def process_train_image(i, out_dir):
             elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "l2_norm":
                 # we simply L2 normalize the features otherwise
                 descriptors = F.normalize(activation_map, p=2, dim=0)
+            elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
+                descriptors = l2n(
+                    gem(
+                        activation_map,
+                        p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
+                        add_bias=False,
+                    )
+                )
             else:
                 descriptors = activation_map
 
@@ -154,19 +148,6 @@ def process_train_image(i, out_dir):
     for i in range(num_images):
         process_train_image(i, out_dir)
 
-    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
-
-        gem_out_fname = None
-        if out_dir:
-            gem_out_fname = f"{out_dir}/{train_dataset_name}_GeM.npy"
-
-        train_features = torch.tensor(np.concatenate(train_features))
-        train_features = gem_pool_and_save_features(
-            train_features,
-            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
-            add_bias=True,
-            gem_out_fname=gem_out_fname,
-        )
     train_features = np.vstack([x.reshape(-1, x.shape[-1]) for x in train_features])
     logging.info(f"Train features size: {train_features.shape}")
     return train_features
@@ -201,6 +182,21 @@ def process_eval_image(
     elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "l2_norm":
         # we simply L2 normalize the features otherwise
         descriptors = F.normalize(activation_map, p=2, dim=0)
+        # Optionally apply pca.
+        if pca:
+            descriptors = pca.apply(descriptors)
+
+    elif cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
+        descriptors = l2n(
+            gem(
+                activation_map,
+                p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
+                add_bias=True,
+            )
+        )
+        # Optionally apply pca.
+        if pca:
+            descriptors = pca.apply(descriptors)
     else:
         descriptors = activation_map
 
@@ -255,19 +251,6 @@ def get_dataset_features(
             )
         features_dataset.append(db_feature)
 
-    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
-        # GeM pool the features and apply the PCA
-        gem_out_fname = None
-        if db_fname_out_dir:
-            gem_out_fname = f"{db_fname_out_dir}/{eval_dataset_name}_GeM.npy"
-        features_dataset = torch.tensor(np.concatenate(features_dataset))
-        features_dataset = gem_pool_and_save_features(
-            features_dataset,
-            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
-            add_bias=True,
-            gem_out_fname=gem_out_fname,
-        )
-        features_dataset = pca.apply(features_dataset)
     features_dataset = np.vstack(features_dataset)
     logging.info(f"Dataset Features Size: {features_dataset.shape}")
     return features_dataset
@@ -325,19 +308,6 @@ def get_queries_features(
             )
         features_queries.append(query_feature)
 
-    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "gem":
-        # GeM pool the features and apply the PCA
-        gem_out_fname = None
-        if q_fname_out_dir:
-            gem_out_fname = f"{q_fname_out_dir}/{eval_dataset_name}_GeM.npy"
-        features_queries = torch.tensor(np.concatenate(features_queries))
-        features_queries = gem_pool_and_save_features(
-            features_queries,
-            p=cfg.IMG_RETRIEVAL.GEM_POOL_POWER,
-            add_bias=True,
-            gem_out_fname=gem_out_fname,
-        )
-        features_queries = pca.apply(features_queries)
     features_queries = np.vstack(features_queries)
     logging.info(f"Queries Features Size: {features_queries.shape}")
     return features_queries
diff --git a/vissl/utils/instance_retrieval_utils/data_util.py b/vissl/utils/instance_retrieval_utils/data_util.py
@@ -485,12 +485,13 @@ def load_and_prepare_image(self, fname, roi=None):
         """
         # Read image, get aspect ratio, and resize such as the largest side equals S
         with PathManager.open(fname, "rb") as f:
-            im = Image.open(f).convert(mode="RGB")
-        im_resized, ratio = self.apply_img_transform(im)
+            img = Image.open(f).convert(mode="RGB")
+        im_resized, ratio = self.apply_img_transform(img)
         # If there is a roi, adapt the roi to the new size and crop. Do not rescale
         # the image once again
         if roi is not None:
             # ROI format is (xmin,ymin,xmax,ymax)
+            roi = np.array(roi)
             roi = np.round(roi * ratio).astype(np.int32)
             im_resized = im_resized[:, roi[1] : roi[3], roi[0] : roi[2]]
         return im_resized
@@ -507,10 +508,14 @@ def load_and_prepare_revisited_image(self, img_path, roi=None):
         with PathManager.open(img_path, "rb") as f:
             img = Image.open(f).convert("RGB")
 
+        im_resized, ratio = self.apply_img_transform(img)
+        # If there is a roi, adapt the roi to the new size and crop. Do not rescale
+        # the image once again
         if roi is not None:
-            img = img.crop(roi)
-
-        im_resized, _ = self.apply_img_transform(img)
+            # ROI format is (xmin,ymin,xmax,ymax)
+            roi = np.array(roi)
+            roi = np.round(roi * ratio).astype(np.int32)
+            im_resized = im_resized[:, roi[1] : roi[3], roi[0] : roi[2]]
         return im_resized