⚡️ Speed up method `PositionEmbeddingLearned.forward` by 30% in PR #1250 (`feature/inference-v1-models`) #1274

codeflash-ai · 2025-05-14T12:23:07Z

⚡️ This pull request contains optimizations for PR #1250

If you approve this dependent PR, these changes will be merged into the original PR branch feature/inference-v1-models.

This PR will be automatically closed if the original PR is merged.

📄 30% (0.30x) speedup for `PositionEmbeddingLearned.forward` in `inference/v1/models/rfdetr/position_encoding.py`

⏱️ Runtime : 5.43 milliseconds → 4.17 milliseconds (best of 38 runs)

📝 Explanation and details

Optimization summary:

- Uses `.expand()` instead of `.repeat()`, which avoids copying data and repeated allocation, reducing both memory usage and runtime.
- Unsqueezes and expands only once per axis, rather than inserting into a list and calling `.repeat()`.
- Precomputes `device` to avoid repeated attribute lookups.
- No functional changes: all return values exactly match the original behavior. Comments are unchanged except where they were updated to reflect the switch to `.expand()`.

✅ Correctness verification report:

| Test | Status |
| --- | --- |
| ⚙️ Existing Unit Tests | 🔘 None Found |
| 🌀 Generated Regression Tests | ✅ 84 Passed |
| ⏪ Replay Tests | 🔘 None Found |
| 🔎 Concolic Coverage Tests | 🔘 None Found |
📊 Tests Coverage

🌀 Generated Regression Tests Details

import pytest  # used for our unit tests
import torch
from inference.v1.models.rfdetr.position_encoding import \
    PositionEmbeddingLearned
from torch import nn


# Dummy NestedTensor class for testing
class NestedTensor:
    """Test-only stand-in that exposes the wrapped value as ``.tensors``."""
    def __init__(self, tensors) -> None:
        # Keep the argument as-is under the attribute name ``tensors``.
        self.tensors = tensors
from inference.v1.models.rfdetr.position_encoding import \
    PositionEmbeddingLearned

# unit tests

# --------------------- BASIC TEST CASES ---------------------

def test_forward_output_shape_basic_square():
    """Run ``forward`` on a square input (H=10, W=10, bs=2) with 32 features.

    The body makes no assertion of its own; the result is bound to
    ``codeflash_output`` so original and optimized outputs can be compared.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=32)
    wrapped = NestedTensor(torch.zeros(10, 10, 2))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_output_shape_basic_rectangular():
    """Run ``forward`` on a rectangular input (H=5, W=7, bs=3) with 8 features.

    Nothing is asserted here; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=8)
    wrapped = NestedTensor(torch.zeros(5, 7, 3))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_output_values_repeatability():
    """Run ``forward`` twice, re-seeding and resetting parameters before each call.

    Both runs use seed 42 and the same wrapped input. The two results are kept
    as ``pos1`` and ``pos2``, but the body does not compare them itself.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=4)

    # First run: seed, re-initialise the parameters, then evaluate.
    torch.manual_seed(42)
    embedding.reset_parameters()
    wrapped = NestedTensor(torch.zeros(3, 4, 1))
    codeflash_output = embedding.forward(wrapped)
    pos1 = codeflash_output

    # Second run: identical seed and reset, same input object.
    torch.manual_seed(42)
    embedding.reset_parameters()
    codeflash_output = embedding.forward(wrapped)
    pos2 = codeflash_output

def test_forward_gradients():
    """Check that ``backward`` can be called on the summed ``forward`` output.

    The input tensor is created with ``requires_grad=True``; the test passes
    as long as summing the output and back-propagating does not raise.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(2, 2, 1, requires_grad=True))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output
    total = pos.sum()
    total.backward()

# --------------------- EDGE TEST CASES ---------------------

def test_forward_minimal_input():
    """Run ``forward`` on the smallest input used here (H=1, W=1, bs=1).

    The only expectation is that the call completes without raising.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=1)
    wrapped = NestedTensor(torch.zeros(1, 1, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_single_row():
    """Run ``forward`` on a one-row input (H=1, W=5, bs=2) with 3 features.

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=3)
    wrapped = NestedTensor(torch.zeros(1, 5, 2))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_single_column():
    """Run ``forward`` on a one-column input (H=7, W=1, bs=4) with 5 features.

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=5)
    wrapped = NestedTensor(torch.zeros(7, 1, 4))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_zero_batch_size():
    """Run ``forward`` on an input whose last dimension is empty (H=3, W=3, bs=0).

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(3, 3, 0))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_max_embedding_index():
    """Run ``forward`` on a 50x50 input with batch size 1.

    NOTE(review): 50 is presumably the size of the module's default embedding
    tables; that definition is not visible here, so confirm against the module.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(50, 50, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_raises_on_too_large_h():
    """Expect ``IndexError`` from ``forward`` when the first dimension is 51."""
    embedding = PositionEmbeddingLearned(num_pos_feats=1)
    # Build the input outside the ``raises`` block so only ``forward`` is guarded.
    wrapped = NestedTensor(torch.zeros(51, 1, 1))
    with pytest.raises(IndexError):
        embedding.forward(wrapped)

def test_forward_raises_on_too_large_w():
    """Expect ``IndexError`` from ``forward`` when the second dimension is 51."""
    embedding = PositionEmbeddingLearned(num_pos_feats=1)
    # Build the input outside the ``raises`` block so only ``forward`` is guarded.
    wrapped = NestedTensor(torch.zeros(1, 51, 1))
    with pytest.raises(IndexError):
        embedding.forward(wrapped)

def test_forward_nonzero_device_cpu():
    """Run ``forward`` on a tensor created without a ``device`` argument.

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(3, 3, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_dtype_float16():
    """Run ``forward`` on a float16 input created without a ``device`` argument.

    This test needs no GPU, so it carries no CUDA ``skipif`` guard and also
    runs on CPU-only machines. Nothing is asserted in the body; the output is
    captured in ``codeflash_output``.
    """
    pe = PositionEmbeddingLearned(num_pos_feats=2)
    x = torch.zeros(4, 4, 1, dtype=torch.float16)
    nt = NestedTensor(x)
    codeflash_output = pe.forward(nt)
    pos = codeflash_output

def test_forward_dtype_float64():
    """Run ``forward`` on a float64 input of size 4x4x1.

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(4, 4, 1, dtype=torch.float64))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_non_contiguous_input():
    """Run ``forward`` on a non-contiguous view of a 4x4x2 tensor.

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    # Swapping the first two axes keeps the shape (4, 4, 2) but yields a
    # non-contiguous view of the underlying storage.
    swapped = torch.zeros(4, 4, 2).transpose(0, 1)
    wrapped = NestedTensor(swapped)
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

# --------------------- LARGE SCALE TEST CASES ---------------------


def test_forward_large_batch():
    """Run ``forward`` with a large last dimension (H=10, W=10, bs=500).

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=4)
    wrapped = NestedTensor(torch.zeros(10, 10, 500))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_large_num_pos_feats():
    """Run ``forward`` with ``num_pos_feats=256`` on a small input (H=5, W=5, bs=2).

    Nothing is asserted; the output is captured in ``codeflash_output``.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=256)
    wrapped = NestedTensor(torch.zeros(5, 5, 2))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_multiple_calls_consistency():
    """Call ``forward`` twice on the same module and the same wrapped input.

    The two results are kept as ``pos1`` and ``pos2``; the body does not
    compare them itself.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=3)
    wrapped = NestedTensor(torch.zeros(7, 7, 2))
    codeflash_output = embedding.forward(wrapped)
    pos1 = codeflash_output
    codeflash_output = embedding.forward(wrapped)
    pos2 = codeflash_output

def test_forward_different_inputs_different_outputs():
    """Call ``forward`` on two inputs with swapped leading dimensions.

    One input is 2x3x1 and the other 3x2x1. The results are kept as ``pos1``
    and ``pos2``; the body does not compare them itself.
    """
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wide = NestedTensor(torch.zeros(2, 3, 1))
    tall = NestedTensor(torch.zeros(3, 2, 1))
    codeflash_output = embedding.forward(wide)
    pos1 = codeflash_output
    codeflash_output = embedding.forward(tall)
    pos2 = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

import pytest  # used for our unit tests
import torch
from inference.v1.models.rfdetr.position_encoding import \
    PositionEmbeddingLearned
from torch import nn

# function to test
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------


# Minimal NestedTensor implementation for testing
class NestedTensor:
    """Test-only stand-in that exposes the wrapped value as ``.tensors``."""
    def __init__(self, tensors) -> None:
        # Keep the argument as-is under the attribute name ``tensors``.
        self.tensors = tensors
from inference.v1.models.rfdetr.position_encoding import \
    PositionEmbeddingLearned

# unit tests

# ---------------- Basic Test Cases ----------------

def test_forward_basic_shape_and_type():
    # Typical case: a 10x20 input with batch size 1, laid out as (H, W, bs).
    # num_pos_feats is kept small (8) for speed. Nothing is asserted here;
    # the output is captured in codeflash_output.
    embedding = PositionEmbeddingLearned(num_pos_feats=8)
    wrapped = NestedTensor(torch.zeros(10, 20, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_batch_size_greater_than_one():
    # Same call pattern with a batch size above one: (H, W, bs) = (5, 7, 3).
    # Nothing is asserted here; the output is captured in codeflash_output.
    embedding = PositionEmbeddingLearned(num_pos_feats=4)
    wrapped = NestedTensor(torch.zeros(5, 7, 3))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_nonzero_input():
    # Feed two same-shaped inputs with different values (random vs. all ones)
    # to one module. The stated intent is that input values should not affect
    # the output; the body keeps both results but does not compare them.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    random_input = NestedTensor(torch.randn(3, 4, 2))
    codeflash_output = embedding.forward(random_input)
    pos1 = codeflash_output
    ones_input = NestedTensor(torch.ones(3, 4, 2))
    codeflash_output = embedding.forward(ones_input)
    pos2 = codeflash_output

def test_forward_different_num_pos_feats():
    # Sweep a few feature widths; each iteration builds a fresh module and a
    # fresh 6x6x2 input. Nothing is asserted inside the loop.
    for width in (1, 5, 16):
        embedding = PositionEmbeddingLearned(num_pos_feats=width)
        wrapped = NestedTensor(torch.zeros(6, 6, 2))
        codeflash_output = embedding.forward(wrapped)
        pos = codeflash_output

# ---------------- Edge Test Cases ----------------

def test_forward_minimal_size():
    # Smallest input used in this suite: H=1, W=1, bs=1.
    # Nothing is asserted here; the output is captured in codeflash_output.
    embedding = PositionEmbeddingLearned(num_pos_feats=3)
    wrapped = NestedTensor(torch.zeros(1, 1, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_max_embedding_index():
    # 50x50 input with batch size 1.
    # NOTE(review): presumably the module's embedding tables hold 50 entries,
    # making 49 the largest valid index; the module is not visible here, so
    # confirm against it.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(50, 50, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_exceed_embedding_index_raises():
    # A size of 51 in either of the first two dimensions is expected to make
    # forward raise IndexError. Inputs are built outside the raises blocks so
    # only the forward call is guarded.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)

    too_tall = NestedTensor(torch.zeros(51, 10, 1))
    with pytest.raises(IndexError):
        codeflash_output = embedding.forward(too_tall)
        _ = codeflash_output

    too_wide = NestedTensor(torch.zeros(10, 51, 1))
    with pytest.raises(IndexError):
        codeflash_output = embedding.forward(too_wide)
        _ = codeflash_output


def test_forward_negative_shape_raises():
    # A tensor with a negative dimension cannot be created, so forward is
    # never reached for such an input. This test only pins that torch itself
    # raises RuntimeError at tensor-creation time.
    with pytest.raises(RuntimeError):
        torch.zeros(-1, 10, 1)

def test_forward_device_consistency():
    # CUDA-only check: when no GPU is available the test returns immediately
    # and therefore passes without doing anything.
    if not torch.cuda.is_available():
        return
    embedding = PositionEmbeddingLearned(num_pos_feats=2).cuda()
    wrapped = NestedTensor(torch.zeros(5, 5, 2, device='cuda'))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_dtype_consistency():
    # float16 input of size 5x5x1, created without a device argument.
    # Nothing is asserted here; the output is captured in codeflash_output.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(5, 5, 1, dtype=torch.float16))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_noncontiguous_input():
    # Swapping the first two axes keeps the shape (5, 5, 2) but yields a
    # non-contiguous view; forward is run on that view.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    swapped = torch.zeros(5, 5, 2).transpose(0, 1)
    wrapped = NestedTensor(swapped)
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

# ---------------- Large Scale Test Cases ----------------


def test_forward_large_num_pos_feats():
    # Large feature width (128) on a 20x20 input with batch size 5.
    # Original author's size estimate: 20 x 20 x 5 x 256 = 512,000 floats,
    # about 2MB, well under 100MB.
    # NOTE(review): that estimate assumes an output with 2 * num_pos_feats
    # channels; the module is not visible here, so confirm.
    embedding = PositionEmbeddingLearned(num_pos_feats=128)
    wrapped = NestedTensor(torch.zeros(20, 20, 5))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_large_batch_size():
    # Large batch: (H, W, bs) = (10, 10, 100) with 8 features.
    # Nothing is asserted here; the output is captured in codeflash_output.
    embedding = PositionEmbeddingLearned(num_pos_feats=8)
    wrapped = NestedTensor(torch.zeros(10, 10, 100))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_performance_large():
    # Scalability smoke test rather than a correctness check: one forward call
    # on a 50x50x20 input is timed with wall-clock time. The elapsed value is
    # computed but never asserted on.
    import time
    embedding = PositionEmbeddingLearned(num_pos_feats=16)
    wrapped = NestedTensor(torch.zeros(50, 50, 20))
    started_at = time.time()
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output
    elapsed = time.time() - started_at

# ---------------- Miscellaneous / Robustness ----------------

def test_forward_requires_grad():
    # Stated intent: the output should require grad whenever the embeddings
    # do. The body only runs forward on a 2x2x1 input and keeps the result;
    # it makes no assertion itself.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(2, 2, 1))
    codeflash_output = embedding.forward(wrapped)
    pos = codeflash_output

def test_forward_no_side_effects():
    # Two back-to-back forward calls on the same module and input. The stated
    # intent is that they give the same result; the body keeps both results
    # but does not compare them.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(3, 3, 1))
    codeflash_output = embedding.forward(wrapped)
    pos1 = codeflash_output
    codeflash_output = embedding.forward(wrapped)
    pos2 = codeflash_output

def test_forward_export_flag_unused():
    # Run forward once, set the _export attribute to True, then run it again.
    # The stated intent is that the flag does not change the output; the body
    # keeps both results but does not compare them.
    embedding = PositionEmbeddingLearned(num_pos_feats=2)
    wrapped = NestedTensor(torch.zeros(3, 3, 1))
    codeflash_output = embedding.forward(wrapped)
    out1 = codeflash_output
    embedding._export = True
    codeflash_output = embedding.forward(wrapped)
    out2 = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To edit these changes git checkout codeflash/optimize-pr1250-2025-05-14T12.23.01 and push.

(`feature/inference-v1-models`)

**Optimization summary:**

- Uses `.expand()` instead of `.repeat()` to minimize memory usage and runtime by avoiding actual data copying and repeated allocation.
- Only unsqueezes and expands once per axis rather than inserting into a list and doing repeat.
- Precomputes `device` to avoid repeated attribute lookups.
- No functional changes, all return values exactly match original behavior. All comments unchanged unless reflecting code optimization for `.expand()`.

codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label May 14, 2025

codeflash-ai bot requested review from PawelPeczek-Roboflow, grzegorz-roboflow, yeldarby, probicheaux and hansent as code owners May 14, 2025 12:23

codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label May 14, 2025

codeflash-ai bot mentioned this pull request May 14, 2025

Add first scratches of new interface #1250

Open

4 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

⚡️ Speed up method `PositionEmbeddingLearned.forward` by 30% in PR #1250 (`feature/inference-v1-models`) #1274

⚡️ Speed up method `PositionEmbeddingLearned.forward` by 30% in PR #1250 (`feature/inference-v1-models`) #1274

Uh oh!

codeflash-ai bot commented May 14, 2025

Uh oh!

Uh oh!

⚡️ Speed up method PositionEmbeddingLearned.forward by 30% in PR #1250 (feature/inference-v1-models) #1274

Are you sure you want to change the base?

⚡️ Speed up method PositionEmbeddingLearned.forward by 30% in PR #1250 (feature/inference-v1-models) #1274

Uh oh!

Conversation

codeflash-ai bot commented May 14, 2025

⚡️ This pull request contains optimizations for PR #1250

📄 30% (0.30x) speedup for PositionEmbeddingLearned.forward in inference/v1/models/rfdetr/position_encoding.py

📝 Explanation and details

Uh oh!

Uh oh!

⚡️ Speed up method `PositionEmbeddingLearned.forward` by 30% in PR #1250 (`feature/inference-v1-models`) #1274

⚡️ Speed up method `PositionEmbeddingLearned.forward` by 30% in PR #1250 (`feature/inference-v1-models`) #1274

📄 30% (0.30x) speedup for `PositionEmbeddingLearned.forward` in `inference/v1/models/rfdetr/position_encoding.py`