Skip to content

[STAL-2746] Fix line/col search in string #443

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Component,Origin,License,Copyright
anyhow,https://crates.io/crates/anyhow,MIT,Copyright (c) 2019 David Tolnay
base64,https://github.com/marshallpierce/rust-base64,Apache-2.0,Copyright (c) 2015 Alice Maz
bstr,https://github.com/BurntSushi/bstr,MIT,Copyright (c) 2018-2019 Andrew Gallant
csv,https://github.com/BurntSushi/rust-csv,MIT,Copyright (c) 2015 Andrew Gallant
deno-core,https://github.com/denoland/deno,MIT,Copyright 2018-2023 the Deno authors
git2,https://crates.io/crates/git2,MIT,Copyright (c) 2014 Alex Crichton
Expand Down
6 changes: 5 additions & 1 deletion crates/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,9 @@ edition = "2021"
version.workspace = true

[dependencies]
anyhow = { workspace = true }
serde = { version = "1.0.203", features = ["derive"] }
derive_builder = { workspace = true }
derive_builder = { workspace = true }

# other
bstr = "1.9.1"
1 change: 1 addition & 0 deletions crates/common/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod analysis_options;
pub mod model;
pub mod utils;
6 changes: 6 additions & 0 deletions crates/common/src/model/position.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ pub struct Position {
pub col: u32,
}

impl Position {
pub fn new(line: u32, col: u32) -> Self {
Self { line, col }
}
}

impl fmt::Display for Position {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "position (line: {}, col: {})", self.line, self.col)
Expand Down
1 change: 1 addition & 0 deletions crates/common/src/utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod position_utils;
150 changes: 150 additions & 0 deletions crates/common/src/utils/position_utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
use crate::model::position::Position;
use bstr::BStr;
use bstr::ByteSlice;

/// Get position of a byte offset in a code and return a [Position].
///
/// Lines and columns are 1-based. Lines are the ones produced by
/// `lines_with_terminator`, and columns are counted in grapheme clusters rather
/// than bytes:
///
/// - an offset on the first byte of a grapheme maps to that grapheme's column;
/// - an offset strictly inside a multi-byte grapheme maps to the next column;
/// - an offset equal to `content.len()` maps to the position right after the
///   last character: column 1 of a new line when the content ends with a line
///   terminator (or is empty), otherwise one column past the last grapheme of
///   the last line.
///
/// # Errors
///
/// Returns an error when `offset` is larger than `content.len()`.
pub fn get_position_in_string(content: &str, offset: usize) -> anyhow::Result<Position> {
    if offset > content.len() {
        anyhow::bail!("offset is larger than content length");
    }

    let bstr = BStr::new(content);

    let mut line_number: u32 = 1;
    // Lines are contiguous and cover the whole content, so each line starts
    // where the previous one ended; no pointer arithmetic is needed.
    let mut start_index: usize = 0;
    for line in bstr.lines_with_terminator() {
        let end_index = start_index + line.len();
        // Only the very last line can lack a terminator. Such a line also owns
        // the end-of-content offset (`offset == end_index`).
        let has_terminator = line.last() == Some(&b'\n');

        if offset < end_index || !has_terminator {
            let mut col_number: u32 = 1;
            for (grapheme_start, grapheme_end, _) in line.grapheme_indices() {
                if offset < start_index + grapheme_end {
                    let col = if offset == start_index + grapheme_start {
                        // It's exactly the index we are looking for.
                        col_number
                    } else {
                        // The offset is within the grapheme, it's the next col.
                        col_number + 1
                    };
                    return Ok(Position {
                        line: line_number,
                        col,
                    });
                }
                col_number += 1;
            }

            // Every grapheme was skipped: the offset is the end of an
            // unterminated last line, i.e. one column past its last grapheme.
            return Ok(Position {
                line: line_number,
                col: col_number,
            });
        }

        line_number += 1;
        start_index = end_index;
    }

    // The content is empty or ends with a line terminator and the offset is
    // `content.len()`: the position is the start of the line that follows.
    Ok(Position {
        line: line_number,
        col: 1,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that every `(offset, line, col)` triple resolves as expected in `text`.
    fn assert_positions(text: &str, expected: &[(usize, u32, u32)]) {
        for &(offset, line, col) in expected {
            assert_eq!(
                get_position_in_string(text, offset).unwrap(),
                Position::new(line, col)
            );
        }
    }

    #[test]
    fn test_get_position_in_string() {
        assert_positions("foobarbaz", &[(3, 1, 4)]);
    }

    #[test]
    fn test_get_position_in_string_out_of_bounds() {
        let result = get_position_in_string("foobarbaz", 42);
        assert!(result.is_err());
    }

    #[test]
    fn test_grapheme() {
        // Each emoji is a single grapheme spanning several bytes.
        let text = "The quick brown\n🦊 jumps over\nthe lazy 🐕\n";
        assert_positions(text, &[(16, 2, 1), (18, 2, 2), (41, 3, 10), (43, 3, 11)]);
    }

    #[test]
    fn test_point_midline() {
        let text = "The quick brown\nfox jumps over\nthe lazy dog";
        assert_positions(
            text,
            &[
                (6, 1, 7),
                (7, 1, 8),
                (8, 1, 9),
                (24, 2, 9),
                (23, 2, 8),
                (22, 2, 7),
                (39, 3, 9),
                (37, 3, 7),
                (38, 3, 8),
            ],
        );
    }

    #[test]
    fn point_slice_boundary() {
        let text = "The quick brown\nfox jumps over\nthe lazy dog\n";
        assert_positions(text, &[(0, 1, 1), (text.len(), 4, 1)]);
    }
}
2 changes: 2 additions & 0 deletions crates/secrets/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ common = { package = "common", path = "../common" }
itertools = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }

# remote
sds = { git = "https://github.com/DataDog/dd-sensitive-data-scanner.git", tag = "v0.1.2", package = "dd-sds" }
23 changes: 2 additions & 21 deletions crates/secrets/src/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

use crate::model::secret_result::{SecretResult, SecretResultMatch};
use crate::model::secret_rule::SecretRule;
use anyhow::{anyhow, Error};
use anyhow::Error;
use common::analysis_options::AnalysisOptions;
use common::model::position::Position;
use common::utils::position_utils::get_position_in_string;
use itertools::Itertools;
use sds::{RuleConfig, Scanner};

Expand All @@ -22,26 +23,6 @@ pub fn build_sds_scanner(rules: &[SecretRule]) -> Scanner {
Scanner::new(&sds_rules).expect("error when instantiating the scanner")
}

/// Get position of a byte offset in a code and return a [`Position`]. This code should
/// ultimately be more efficient as we grow the platform, it's considered as "good enough" for now.
///
/// Lines and columns are 1-based; the column is a byte count from the start of the
/// line. Lines are split on `\n` and measured with their terminator included, so
/// content using `\r\n` line endings is located correctly. An offset equal to
/// `content.len()` maps to the position right after the last character.
///
/// # Errors
///
/// Returns an error when `offset` is larger than `content.len()`, or when the
/// column does not fit in a `u32`.
pub fn get_position_in_string(content: &str, offset: usize) -> anyhow::Result<Position> {
    if offset > content.len() {
        return Err(anyhow!("line not found"));
    }

    let mut line_number: u32 = 1;
    let mut line_start: usize = 0;

    // `split_inclusive` keeps the terminator in each piece, so the running byte
    // count stays exact whether a line ends with "\n" or "\r\n".
    for raw_line in content.split_inclusive('\n') {
        let line_end = line_start + raw_line.len();

        // Only the last piece can lack a terminator; it also owns the
        // end-of-content offset (`offset == line_end`).
        if offset < line_end || !raw_line.ends_with('\n') {
            return Ok(Position {
                line: line_number,
                col: u32::try_from(offset - line_start + 1)?,
            });
        }

        line_number += 1;
        line_start = line_end;
    }

    // The content is empty or ends with a newline and the offset is
    // `content.len()`: the position is the start of the line that follows.
    Ok(Position {
        line: line_number,
        col: 1,
    })
}

/// Find secrets in code. This is the main entrypoint for our SDS integration.
pub fn find_secrets(
scanner: &Scanner,
Expand Down
Loading