
Commit a5affa6

Merge branch 'v0.19.0-rc' into auto-v0.19.0-rc-952633dcc2d95d8ba14a805b627332ba1d998ecd
2 parents: 2efc542 + 2d2015c

6 files changed: +133 / -55 lines

e2e_test/source/cdc/cdc.check.slt

Lines changed: 11 additions & 0 deletions
@@ -26,3 +26,14 @@ query V
 select count(*) as cnt from mytable;
 ----
 4
+
+# Skipped due to https://github.com/risingwavelabs/risingwave/issues/10206
+# query IIII
+# select count(*) from orders_2;
+# ----
+# 3
+
+query IIII
+select count(*) from shipments_2;
+----
+3

e2e_test/source/cdc/cdc.load.slt

Lines changed: 36 additions & 0 deletions
@@ -87,3 +87,39 @@ create table mytable (
   table.name = 'mytable',
   server.id = '5087'
 );
+
+# Some columns missing and reordered (mysql-cdc)
+statement ok
+create table orders_2 (
+  order_id int,
+  price decimal,
+  customer_name string,
+  PRIMARY KEY (order_id)
+) with (
+  connector = 'mysql-cdc',
+  hostname = 'mysql',
+  port = '3306',
+  username = 'root',
+  password = '123456',
+  database.name = 'mydb',
+  table.name = 'orders',
+  server.id = '5087'
+);
+
+# Some columns missing and reordered (postgres-cdc)
+statement ok
+create table shipments_2 (
+  origin STRING,
+  destination STRING,
+  shipment_id INTEGER,
+  order_id INTEGER,
+  PRIMARY KEY (shipment_id)
+) with (
+  connector = 'postgres-cdc',
+  hostname = 'db',
+  port = '5432',
+  username = 'postgres',
+  password = 'postgres',
+  database.name = 'cdc_test',
+  table.name = 'shipments'
+);

java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/MySqlValidator.java

Lines changed: 17 additions & 15 deletions
@@ -20,6 +20,7 @@
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;

@@ -127,31 +128,32 @@ private void validateTableSchema() throws SQLException {
                 jdbcConnection.prepareStatement(ValidatorUtils.getSql("mysql.table_schema"))) {
             stmt.setString(1, userProps.get(DbzConnectorConfig.DB_NAME));
             stmt.setString(2, userProps.get(DbzConnectorConfig.TABLE_NAME));
-            var res = stmt.executeQuery();
+
+            // Field name in lower case -> data type
+            var schema = new HashMap<String, String>();
             var pkFields = new HashSet<String>();
-            int index = 0;
+            var res = stmt.executeQuery();
             while (res.next()) {
                 var field = res.getString(1);
                 var dataType = res.getString(2);
                 var key = res.getString(3);
-
-                if (index >= tableSchema.getNumColumns()) {
-                    throw ValidatorUtils.invalidArgument("The number of columns mismatch");
+                schema.put(field.toLowerCase(), dataType);
+                if (key.equalsIgnoreCase("PRI")) {
+                    // RisingWave always use lower case for column name
+                    pkFields.add(field.toLowerCase());
                 }
+            }

-                var srcColName = tableSchema.getColumnNames()[index++];
-                if (!srcColName.equalsIgnoreCase(field)) {
+            // All columns defined must exist in upstream database
+            for (var e : tableSchema.getColumnTypes().entrySet()) {
+                var pgDataType = schema.get(e.getKey().toLowerCase());
+                if (pgDataType == null) {
                     throw ValidatorUtils.invalidArgument(
-                            String.format("column name mismatch: %s, [%s]", field, srcColName));
+                            "Column '" + e.getKey() + "' not found in the upstream database");
                 }
-
-                if (!isDataTypeCompatible(dataType, tableSchema.getColumnType(srcColName))) {
+                if (!isDataTypeCompatible(pgDataType, e.getValue())) {
                     throw ValidatorUtils.invalidArgument(
-                            String.format("incompatible data type of column %s", srcColName));
-                }
-                if (key.equalsIgnoreCase("PRI")) {
-                    // RisingWave always use lower case for column name
-                    pkFields.add(field.toLowerCase());
+                            "Incompatible data type of column " + e.getKey());
                 }
             }

java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/PostgresValidator.java

Lines changed: 14 additions & 11 deletions
@@ -132,29 +132,32 @@ private void validateTableSchema() throws SQLException {
                 throw ValidatorUtils.invalidArgument("Primary key mismatch");
             }
         }
-        // check whether source schema match table schema on upstream
+
+        // Check whether source schema match table schema on upstream
+        // All columns defined must exist in upstream database
         try (var stmt =
                 jdbcConnection.prepareStatement(ValidatorUtils.getSql("postgres.table_schema"))) {
             stmt.setString(1, schemaName);
             stmt.setString(2, tableName);
             var res = stmt.executeQuery();
-            int index = 0;
+
+            // Field name in lower case -> data type
+            Map<String, String> schema = new HashMap<>();
             while (res.next()) {
                 var field = res.getString(1);
                 var dataType = res.getString(2);
-                if (index >= tableSchema.getNumColumns()) {
-                    throw ValidatorUtils.invalidArgument("The number of columns mismatch");
-                }
+                schema.put(field.toLowerCase(), dataType);
+            }

-                var srcColName = tableSchema.getColumnNames()[index++];
-                if (!srcColName.equalsIgnoreCase(field)) {
+            for (var e : tableSchema.getColumnTypes().entrySet()) {
+                var pgDataType = schema.get(e.getKey().toLowerCase());
+                if (pgDataType == null) {
                     throw ValidatorUtils.invalidArgument(
-                            "table column defined in the source mismatches upstream column "
-                                    + field);
+                            "Column '" + e.getKey() + "' not found in the upstream database");
                 }
-                if (!isDataTypeCompatible(dataType, tableSchema.getColumnType(srcColName))) {
+                if (!isDataTypeCompatible(pgDataType, e.getValue())) {
                     throw ValidatorUtils.invalidArgument(
-                            "incompatible data type of column " + srcColName);
+                            "Incompatible data type of column " + e.getKey());
                 }
             }
         }
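
Both validators now apply the same rule: every column declared in the RisingWave table must exist in the upstream table, matched case-insensitively, with a compatible data type; extra upstream columns and a different declaration order are tolerated, which is what the orders_2 and shipments_2 tests above exercise. A minimal Rust sketch of that rule (hypothetical names; the real compatibility logic lives in the Java validators' isDataTypeCompatible):

use std::collections::HashMap;

// Hypothetical stand-in for the Java isDataTypeCompatible helper; the real
// rules live in MySqlValidator/PostgresValidator.
fn is_data_type_compatible(upstream_type: &str, defined_type: &str) -> bool {
    upstream_type.eq_ignore_ascii_case(defined_type)
}

// Every column declared in the source table must exist upstream (matched
// case-insensitively) with a compatible type; column order and extra
// upstream columns do not matter.
fn validate_columns(
    defined: &HashMap<String, String>,  // declared column name -> data type
    upstream: &HashMap<String, String>, // lower-cased upstream column name -> data type
) -> Result<(), String> {
    for (name, defined_type) in defined {
        match upstream.get(&name.to_lowercase()) {
            None => {
                return Err(format!(
                    "Column '{}' not found in the upstream database",
                    name
                ))
            }
            Some(upstream_type) if !is_data_type_compatible(upstream_type, defined_type) => {
                return Err(format!("Incompatible data type of column {}", name));
            }
            Some(_) => {}
        }
    }
    Ok(())
}

The previous code walked the upstream result set positionally against tableSchema.getColumnNames()[index++], so any missing or reordered column was rejected with a mismatch error.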

src/connector/src/parser/debezium/avro_parser.rs

Lines changed: 52 additions & 28 deletions
@@ -26,6 +26,7 @@ use risingwave_common::error::{Result, RwError};
 use risingwave_pb::plan_common::ColumnDesc;

 use super::operators::*;
+use crate::common::UpsertMessage;
 use crate::impl_common_parser_logic;
 use crate::parser::avro::util::{
     avro_field_to_column_desc, extract_inner_field_schema, from_avro_value,
@@ -192,42 +193,57 @@ impl DebeziumAvroParser {
         payload: Vec<u8>,
         mut writer: SourceStreamChunkRowWriter<'_>,
     ) -> Result<WriteGuard> {
+        // https://debezium.io/documentation/reference/stable/transformations/event-flattening.html#event-flattening-behavior:
+        //
+        // A database DELETE operation causes Debezium to generate two Kafka records:
+        // - A record that contains "op": "d", the before row data, and some other fields.
+        // - A tombstone record that has the same key as the deleted row and a value of null. This
+        //   record is a marker for Apache Kafka. It indicates that log compaction can remove
+        //   all records that have this key.

+        let UpsertMessage {
+            primary_key: key,
+            record: payload,
+        } = bincode::deserialize(&payload[..]).unwrap();
+
+        // If message value == null, it must be a tombstone message. Emit DELETE to downstream using
+        // message key as the DELETE row. Throw an error if message key is empty.
+        if payload.is_empty() {
+            let (schema_id, mut raw_payload) = extract_schema_id(&key)?;
+            let key_schema = self.schema_resolver.get(schema_id).await?;
+            let key = from_avro_datum(key_schema.as_ref(), &mut raw_payload, None)
+                .map_err(|e| RwError::from(ProtocolError(e.to_string())))?;
+            return writer.delete(|column| {
+                let field_schema =
+                    extract_inner_field_schema(&self.inner_schema, Some(&column.name))?;
+                from_avro_value(
+                    get_field_from_avro_value(&key, column.name.as_str())?.clone(),
+                    field_schema,
+                )
+            });
+        }
+
         let (schema_id, mut raw_payload) = extract_schema_id(&payload)?;
         let writer_schema = self.schema_resolver.get(schema_id).await?;
-
         let avro_value = from_avro_datum(writer_schema.as_ref(), &mut raw_payload, None)
             .map_err(|e| RwError::from(ProtocolError(e.to_string())))?;
-
         let op = get_field_from_avro_value(&avro_value, OP)?;
+
         if let Value::String(op_str) = op {
             match op_str.as_str() {
-                DEBEZIUM_UPDATE_OP => {
-                    let before = get_field_from_avro_value(&avro_value, BEFORE)
-                        .map_err(|_| {
-                            RwError::from(ProtocolError(
-                                "before is missing for updating event. If you are using postgres, you may want to try ALTER TABLE $TABLE_NAME REPLICA IDENTITY FULL;".to_string(),
-                            ))
-                        })?;
-                    let after = get_field_from_avro_value(&avro_value, AFTER)?;
-
-                    writer.update(|column| {
-                        let field_schema =
-                            extract_inner_field_schema(&self.inner_schema, Some(&column.name))?;
-                        let before = from_avro_value(
-                            get_field_from_avro_value(before, column.name.as_str())?.clone(),
-                            field_schema,
-                        )?;
-                        let after = from_avro_value(
-                            get_field_from_avro_value(after, column.name.as_str())?.clone(),
-                            field_schema,
-                        )?;
+                DEBEZIUM_CREATE_OP | DEBEZIUM_UPDATE_OP | DEBEZIUM_READ_OP => {
+                    // - If debezium op == CREATE, emit INSERT to downstream using the after field
+                    //   in the debezium value as the INSERT row.
+                    // - If debezium op == UPDATE, emit INSERT to downstream using the after field
+                    //   in the debezium value as the INSERT row.

-                        Ok((before, after))
-                    })
-                }
-                DEBEZIUM_CREATE_OP | DEBEZIUM_READ_OP => {
                     let after = get_field_from_avro_value(&avro_value, AFTER)?;
-
+                    if *after == Value::Null {
+                        return Err(RwError::from(ProtocolError(format!(
+                            "after is null for {} event",
+                            op_str
+                        ))));
+                    }
                     writer.insert(|column| {
                         let field_schema =
                             extract_inner_field_schema(&self.inner_schema, Some(&column.name))?;
@@ -238,12 +254,20 @@ impl DebeziumAvroParser {
                     })
                 }
                 DEBEZIUM_DELETE_OP => {
+                    // If debezium op == DELETE, emit DELETE to downstream using the before field as
+                    //   the DELETE row.
+
                     let before = get_field_from_avro_value(&avro_value, BEFORE)
                         .map_err(|_| {
                             RwError::from(ProtocolError(
-                                "before is missing for updating event. If you are using postgres, you may want to try ALTER TABLE $TABLE_NAME REPLICA IDENTITY FULL;".to_string(),
+                                "before is missing for the Debezium delete op. If you are using postgres, you may want to try ALTER TABLE $TABLE_NAME REPLICA IDENTITY FULL;".to_string(),
                             ))
                         })?;
+                    if *before == Value::Null {
+                        return Err(RwError::from(ProtocolError(
+                            "before is null for DELETE event".to_string(),
+                        )));
+                    }

                     writer.delete(|column| {
                         let field_schema =
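
Stripped of the Avro decoding details, the new control flow is: deserialize the payload into an UpsertMessage (Kafka key plus value); an empty value is a tombstone and becomes a DELETE keyed by the message key; otherwise the Debezium op field decides whether the after image is inserted (create, update, read) or the before image is deleted. A simplified sketch of that dispatch, with hypothetical Row and ChangeEvent types standing in for the real writer and Avro values:

// Hypothetical simplified view of a decoded Debezium change event.
struct ChangeEvent {
    op: String,          // "c", "u", "r" or "d"
    before: Option<Row>, // row image before the change
    after: Option<Row>,  // row image after the change
}

struct Row; // placeholder for a decoded row

enum Action {
    Insert(Row),
    Delete(Row),
}

// Decide what to emit for one Kafka record (key, value), mirroring the new control flow.
fn plan_action(key_row: Row, value: Option<ChangeEvent>) -> Result<Action, String> {
    let event = match value {
        // Tombstone: the value is null, so delete the row identified by the message key.
        None => return Ok(Action::Delete(key_row)),
        Some(event) => event,
    };
    match event.op.as_str() {
        // CREATE, UPDATE and READ all emit an INSERT built from the after image.
        "c" | "u" | "r" => match event.after {
            Some(row) => Ok(Action::Insert(row)),
            None => Err(format!("after is null for {} event", event.op)),
        },
        // DELETE emits a DELETE built from the before image.
        "d" => match event.before {
            Some(row) => Ok(Action::Delete(row)),
            None => Err("before is null for DELETE event".to_string()),
        },
        other => Err(format!("unknown Debezium op: {}", other)),
    }
}

Emitting updates as plain inserts relies on the downstream treating the stream as an upsert keyed by primary key, which is consistent with DebeziumAvro being added to is_upsert() in the next diff.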

src/connector/src/parser/mod.rs

Lines changed: 3 additions & 1 deletion
@@ -426,7 +426,9 @@ impl SpecificParserConfig {
     pub fn is_upsert(&self) -> bool {
         matches!(
             self,
-            SpecificParserConfig::UpsertJson | SpecificParserConfig::UpsertAvro(_)
+            SpecificParserConfig::UpsertJson
+                | SpecificParserConfig::UpsertAvro(_)
+                | SpecificParserConfig::DebeziumAvro(_)
         )
     }
