-
Notifications
You must be signed in to change notification settings - Fork 0
Summary upload #50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Summary upload #50
Changes from 23 commits
fba5c42
9b0c82a
0aba15d
dbe401a
fbb7ffd
c503295
2f11cb0
2712d19
8eb6042
8b0d460
f7ff3c3
c50c873
5d59bbf
0956227
697afbc
4848221
cb20bb1
425adb9
9b231ee
90c157d
54cda0c
5f1b30b
db22935
4f9882a
24375e6
f15ab1f
b8c0d8c
c1bbe1f
835ab23
673cb1e
f966868
3c50b70
bc494fc
1b9f275
9c46dbc
27fdc18
1405872
3fe25fe
df6ccc1
2e4d2df
57c0291
a6447c8
d7860ee
4e18f18
b5d08b8
23e710f
f4d27fc
254e6c4
9a9ac67
cd7cd8e
ba9ea0c
6e75bcd
013c7d6
fb74aba
f920ef2
436a063
1acf3b5
4a8e5f9
40c52df
591c5ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,8 +18,8 @@ package ai.chronon.aggregator.test | |
|
||
import ai.chronon.aggregator.base.ApproxPercentiles | ||
import ai.chronon.aggregator.row.StatsGenerator | ||
import com.yahoo.sketches.kll.KllFloatsSketch | ||
import junit.framework.TestCase | ||
import org.apache.datasketches.kll.KllFloatsSketch | ||
import org.junit.Assert._ | ||
import org.slf4j.Logger | ||
import org.slf4j.LoggerFactory | ||
|
@@ -28,7 +28,8 @@ import scala.util.Random | |
|
||
class ApproxPercentilesTest extends TestCase { | ||
@transient lazy val logger: Logger = LoggerFactory.getLogger(getClass) | ||
def testBasicImpl(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = { | ||
|
||
def basicImplTestHelper(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = { | ||
val sorted = (0 to nums).map(_.toFloat) | ||
val elems = Random.shuffle(sorted.toList).toArray | ||
val chunks = elems.sliding(slide, slide) | ||
|
@@ -58,14 +59,14 @@ class ApproxPercentilesTest extends TestCase { | |
def testBasicPercentiles: Unit = { | ||
val percentiles_tested: Int = 31 | ||
val percentiles: Array[Double] = (0 to percentiles_tested).toArray.map(i => i * 1.0 / percentiles_tested) | ||
testBasicImpl(3000, 5, 100, percentiles, errorPercent = 4) | ||
testBasicImpl(30000, 50, 200, percentiles, errorPercent = 2) | ||
testBasicImpl(30000, 50, 50, percentiles, errorPercent = 5) | ||
basicImplTestHelper(3000, 5, 100, percentiles, errorPercent = 4) | ||
basicImplTestHelper(30000, 50, 200, percentiles, errorPercent = 2) | ||
basicImplTestHelper(30000, 50, 50, percentiles, errorPercent = 5) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Consider adding edge case tests While the current test cases cover a good range of scenarios, consider adding tests for edge cases:
Example addition: basicImplTestHelper(0, 1, 50, Array(0.0, 1.0), errorPercent = 1)
basicImplTestHelper(1, 1, 2, Array(0.0, 0.5, 1.0), errorPercent = 1) |
||
} | ||
|
||
def getPSIDrift(sample1: Array[Float], sample2: Array[Float]): Double = { | ||
val sketch1 = new KllFloatsSketch(200) | ||
val sketch2 = new KllFloatsSketch(200) | ||
val sketch1 = KllFloatsSketch.newHeapInstance(200) | ||
val sketch2 = KllFloatsSketch.newHeapInstance(200) | ||
sample1.map(sketch1.update) | ||
sample2.map(sketch2.update) | ||
val drift = StatsGenerator.PSIKllSketch(sketch1.toByteArray, sketch2.toByteArray).asInstanceOf[Double] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,12 @@ abstract class CStream[+T: ClassTag] { | |
else min + ((max - min) * math.random) | ||
} | ||
|
||
protected def rollFloat(max: JDouble, min: JDouble = 0, nullRate: Double = 0.1): java.lang.Float = { | ||
val dice: Double = math.random | ||
if (dice < nullRate) null | ||
else (min + ((max - min) * math.random)).toFloat | ||
} | ||
chewy-zlai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// roll a dice that gives max to min uniformly, with nulls interspersed as per null rate | ||
protected def roll(max: JLong, min: JLong = 0, nullRate: Double = 0.1): JLong = { | ||
val roll = rollDouble(max.toDouble, min.toDouble, nullRate) | ||
|
@@ -45,9 +51,9 @@ abstract class CStream[+T: ClassTag] { | |
Stream.fill(count)(next()) | ||
} | ||
|
||
def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Seq[T]] = { | ||
def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Any] = { | ||
def innerNext(): T = next() | ||
new CStream[Seq[T]] { | ||
new CStream[Any] { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider preserving type safety in the Changing the return type of Apply this diff to restore the original return type and maintain type safety: -def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Any] = {
+def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Seq[T]] = {
def innerNext(): T = next()
new CStream[Seq[T]] {
override def next(): Seq[T] = {
|
||
override def next(): Seq[T] = { | ||
val size = roll(minSize, maxSize, nullRate) | ||
if (size != null) { | ||
|
@@ -58,17 +64,36 @@ abstract class CStream[+T: ClassTag] { | |
} | ||
} | ||
} | ||
|
||
def zipChunk[Other](other: CStream[Other], minSize: Int = 0, maxSize: Int = 20, nullRate: Double = 0.1): CStream[Any] = { | ||
def nextKey(): T = next() | ||
def nextValue(): Other = other.next() | ||
|
||
new CStream[Any] { | ||
override def next(): Map[T, Other] = { | ||
val size = roll(minSize, maxSize, nullRate) | ||
if (size != null) { | ||
(0 until size.toInt).map { _ => nextKey() -> nextValue() }.toMap | ||
} else { | ||
null | ||
} | ||
} | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Preserve type information in the The Apply this diff to update the return type and maintain type safety: -def zipChunk[Other](other: CStream[Other], minSize: Int = 0, maxSize: Int = 20, nullRate: Double = 0.1): CStream[Any] = {
+def zipChunk[Other](other: CStream[Other], minSize: Int = 0, maxSize: Int = 20, nullRate: Double = 0.1): CStream[Map[T, Other]] = {
def nextKey(): T = next()
def nextValue(): Other = other.next()
new CStream[Map[T, Other]] {
- override def next(): Map[T, Other] = {
+ override def next(): Map[T, Other] = {
|
||
} | ||
|
||
object CStream { | ||
private type JLong = java.lang.Long | ||
private type JDouble = java.lang.Double | ||
private type JFloat = java.lang.Float | ||
|
||
def genTimestamps(window: Window, | ||
count: Int, | ||
roundMillis: Int = 1, | ||
maxTs: Long = System.currentTimeMillis()): Array[Long] = | ||
new CStream.TimeStream(window, roundMillis, maxTs).gen(count).toArray.sorted | ||
new CStream.TimeStream(window, roundMillis, maxTs).gen(count).toArray.sorted(new Ordering[Any] { | ||
override def compare(x: Any, y: Any): Int = x.asInstanceOf[Long].compareTo(y.asInstanceOf[Long]) | ||
}) | ||
chewy-zlai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def genPartitions(count: Int, partitionSpec: PartitionSpec): Array[String] = { | ||
val today = partitionSpec.at(System.currentTimeMillis()) | ||
|
@@ -121,6 +146,11 @@ object CStream { | |
Option(rollDouble(max, 1, nullRate = nullRate)).map(java.lang.Double.valueOf(_)).orNull | ||
} | ||
|
||
class FloatStream(max: Double = 10000, nullRate: Double = 0.1) extends CStream[JFloat] { | ||
override def next(): java.lang.Float = | ||
Option(rollFloat(max, 1, nullRate = nullRate)).map(java.lang.Float.valueOf(_)).orNull | ||
} | ||
chewy-zlai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
class ZippedStream(streams: CStream[Any]*)(tsIndex: Int) extends CStream[TestRow] { | ||
override def next(): TestRow = | ||
new TestRow(streams.map(_.next()).toArray: _*)(tsIndex) | ||
|
@@ -139,7 +169,7 @@ object CStream { | |
} | ||
|
||
case class Column(name: String, `type`: DataType, cardinality: Int, chunkSize: Int = 10, nullRate: Double = 0.1) { | ||
def genImpl(dtype: DataType, partitionColumn: String, partitionSpec: PartitionSpec): CStream[Any] = | ||
def genImpl(dtype: DataType, partitionColumn: String, partitionSpec: PartitionSpec, nullRate: Double): CStream[Any] = | ||
dtype match { | ||
case StringType => | ||
name match { | ||
|
@@ -148,18 +178,23 @@ case class Column(name: String, `type`: DataType, cardinality: Int, chunkSize: I | |
} | ||
case IntType => new IntStream(cardinality, nullRate) | ||
case DoubleType => new DoubleStream(cardinality, nullRate) | ||
case FloatType => new FloatStream(cardinality, nullRate) | ||
case LongType => | ||
name match { | ||
case Constants.TimeColumn => new TimeStream(new Window(cardinality, TimeUnit.DAYS)) | ||
case _ => new LongStream(cardinality, nullRate) | ||
} | ||
case ListType(elementType) => | ||
genImpl(elementType, partitionColumn, partitionSpec).chunk(chunkSize) | ||
genImpl(elementType, partitionColumn, partitionSpec, nullRate).chunk(chunkSize) | ||
case MapType(keyType, valueType) => | ||
val keyStream = genImpl(keyType, partitionColumn, partitionSpec, 0) | ||
val valueStream = genImpl(valueType, partitionColumn, partitionSpec, nullRate) | ||
keyStream.zipChunk(valueStream, maxSize = chunkSize) | ||
case otherType => throw new UnsupportedOperationException(s"Can't generate random data for $otherType yet.") | ||
} | ||
|
||
def gen(partitionColumn: String, partitionSpec: PartitionSpec): CStream[Any] = | ||
genImpl(`type`, partitionColumn, partitionSpec) | ||
genImpl(`type`, partitionColumn, partitionSpec, nullRate) | ||
def schema: (String, DataType) = name -> `type` | ||
} | ||
case class RowsWithSchema(rows: Array[TestRow], schema: Seq[(String, DataType)]) |
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -17,8 +17,8 @@ | |||||||||
|
||||||||||
set -o xtrace | ||||||||||
|
||||||||||
export CHRONON_VERSION_STR=$1 | ||||||||||
export CHRONON_BRANCH_STR=$2 | ||||||||||
export VERSION=$1 | ||||||||||
export BRANCH=$2 | ||||||||||
ACTION=$3 | ||||||||||
|
||||||||||
echo "Finding working directory.." | ||||||||||
|
@@ -37,8 +37,8 @@ elif [[ "${ACTION}" == "release" ]]; then | |||||||||
PYPI_REPOSITORY="chronon-pypi" | ||||||||||
# Sanity checks, git state, Run Tests, Build, Release | ||||||||||
# Make sure the version string doesn't contain SNAPSHOT if so it signifies development build and cannot be released. | ||||||||||
if [[ "${CHRONON_VERSION_STR}" == *"SNAPSHOT"* ]]; then | ||||||||||
echo "Python releases cannot be done for in development versions. Version: ${CHRONON_VERSION_STR}" | ||||||||||
if [[ "${VERSION}" == *"SNAPSHOT"* ]]; then | ||||||||||
echo "Python releases cannot be done for in development versions. Version: ${VERSION}" | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix variable name inconsistency in version check. The script exports Apply this fix: - if [[ "${VERSION}" == *"SNAPSHOT"* ]]; then
- echo "Python releases cannot be done for in development versions. Version: ${VERSION}"
+ if [[ "${CHRONON_VERSION_STR}" == *"SNAPSHOT"* ]]; then
+ echo "Python releases cannot be done for in development versions. Version: ${CHRONON_VERSION_STR}" 📝 Committable suggestion
Suggested change
💡 Codebase verification I can now provide the final response based on the analysis of the code: Bug found: VERSION variable mismatch in python-api-build.sh There's an inconsistency in the version variable usage in python-api-build.sh:
This is a bug as the script exports and uses different variable names for the same purpose, which will cause the SNAPSHOT version check to fail silently since 🔗 Analysis chainVerify version variable usage across related files. Let's ensure consistent version variable naming across related files. Let me analyze the version variable usage more deeply by checking the actual implementation and usage context. The results will help us understand if there's any inconsistency in version handling between these files and if the VERSION check in python-api-build.sh is correctly implemented. 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Description: Check for inconsistent version variable usage
# Expected: All files should use the same version variable name
# Search for both version variable names in Python and shell scripts
echo "Checking version variable usage in setup.py and related files:"
rg -l "CHRONON_VERSION_STR|VERSION" --type py --type sh
echo -e "\nChecking specific version variable usage:"
rg "(CHRONON_VERSION_STR|VERSION)\s*=" --type py --type sh
Length of output: 672 Script: #!/bin/bash
# Check how VERSION is used in python-api-build.sh
echo "Content of python-api-build.sh:"
cat api/py/python-api-build.sh
echo -e "\nChecking VERSION usage in setup.py:"
cat api/py/setup.py
Length of output: 4855 |
||||||||||
exit 1 | ||||||||||
fi | ||||||||||
echo "Running tests, git check, build and release..." | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,8 +29,8 @@ | |
__version__ = "local" | ||
__branch__ = "main" | ||
def get_version(): | ||
version_str = os.environ.get("CHRONON_VERSION_STR", __version__) | ||
branch_str = os.environ.get("CHRONON_BRANCH_STR", __branch__) | ||
version_str = os.environ.get("VERSION", __version__) | ||
branch_str = os.environ.get("BRANCH", __branch__) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Codebase verification Update required in python-api-build.sh script for environment variable changes The verification has identified that
🔗 Analysis chainEnvironment variable names have been simplified. The change to use simpler environment variable names ( Let's verify the impact of this change: Consider the following to ensure a smooth transition:
Would you like me to help create a GitHub issue to track these tasks? Let me check the content of the identified shell script to understand the impact better. Let me check if there are any CI/CD configuration files that might be using these environment variables. 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Description: Check for any references to the old environment variable names
# that need to be updated.
echo "Checking for old environment variable references..."
rg -l "CHRONON_VERSION_STR|CHRONON_BRANCH_STR"
echo "Checking for any shell scripts that might be setting these variables..."
fd -e sh -x grep -l "CHRONON_VERSION_STR|CHRONON_BRANCH_STR" {}
Length of output: 409 Script: #!/bin/bash
# Check the content of the build script that uses these variables
cat api/py/python-api-build.sh
Length of output: 2204 Script: #!/bin/bash
# Check for CI/CD configuration files that might reference these variables
echo "Checking GitHub Actions workflows..."
fd -e yml -e yaml -p '.github/workflows' -x grep -l "CHRONON_VERSION_STR\|CHRONON_BRANCH_STR" {}
echo "Checking for other potential CI/CD configs..."
fd -e yml -e yaml -x grep -l "CHRONON_VERSION_STR\|CHRONON_BRANCH_STR" {}
echo "Checking for any Dockerfiles..."
fd Dockerfile -x grep -l "CHRONON_VERSION_STR\|CHRONON_BRANCH_STR" {}
Length of output: 507 |
||
# Replace "-SNAPSHOT" with ".dev" | ||
version_str = version_str.replace("-SNAPSHOT", ".dev") | ||
# If the prefix is the branch name, then convert it as suffix after '+' to make it Python PEP440 complaint | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Consider parameter validation and functional improvements.
While the documentation is good, the method could benefit from parameter validation and a more functional approach.
Consider these improvements:
Also applies to: 170-173