@@ -34,45 +34,40 @@ message SimulatedQuantization {
   int32 num_buckets = 3;
 }

-// Dynamic learning rate specification in the TPUEmbeddingConfiguration. The
-// actual learning rates are provided as a scalar input list to the
+// Dynamic input specification for optimizers in the TPUEmbeddingConfiguration.
+// The actual dynamic inputs are provided as a scalar input list to the
 // SendTPUEmbeddingGradients Op indexed by their tag specified through the
 // following proto.
-message DynamicLearningRate {
-  // For tables where learning rates are dynamically computed and communicated
-  // to the TPU embedding program, a tag must be specified for the learning
-  // rate.
+message OptimizerDynamicInput {
+  // For tables where dynamic inputs are needed (e.g., learning rates or other
+  // dynamic hyperparameters used in optimizers), a tag must be specified for
+  // the input.
   //
-  // The tag must be a non-negative integer. The total number of unique tags
-  // must be less than or equal to the number of tables in the TPU embedding
-  // configuration (a table does not specify any tag if it uses a constant
-  // learning rate, and specifies exactly one tag if it uses dynamic learning
-  // rates).
-  //
-  // All tags in the range [0, number_of_unique_tags) must be present in the TPU
-  // embedding configuration, i.e. a tag cannot be skipped if a different tag
-  // numerically greater than it is used in the configuration.
+  // The tag must be a non-negative integer. All tags in the range
+  // [0, number_of_unique_tags) must be present in the TPU embedding
+  // configuration, i.e. a tag cannot be skipped if a different tag numerically
+  // greater than it is used in the configuration.
   //
   // If multiple tables specify the same tag, they *MUST* have
-  // the same dynamic learning rate, for example, their dynamic learning rate
-  // could be computed by the same TensorFlow sub-graph. The partitioning of the
+  // the same dynamic input, for example, their dynamic learning rate could be
+  // computed by the same TensorFlow sub-graph. The partitioning of the
   // embedding layer would be more optimal if the number_of_unique_tags is as
   // *LOW* as possible, i.e., if many tables share the same tag.
   //
-  // The learning_rate input of the SendTPUEmbeddingGradients op is used to
-  // communicate dynamic learning rates to the TPU embedding program.
-  // The learning_rate input is a list of scalars where the size of the list is
-  // equal to the number of unique tags. The learning rate associated with a
-  // particular tag is specified by populating its corresponding index in the
-  // list of learning_rate scalars.
+  // The hyper_parameters input of the SendTPUEmbeddingGradients op is used to
+  // communicate dynamic hyper-parameters to the TPU embedding program.
+  // The hyper_parameters input is a list of scalars where the size of the list
+  // is equal to the number of unique tags. The hyper-parameter associated with
+  // a particular tag is specified by populating its corresponding index in the
+  // list of scalars.
   int32 tag = 1;
 }

 // Source of learning rate to use.
 message LearningRate {
   oneof learning_rate {
     float constant = 1;
-    DynamicLearningRate dynamic = 2;
+    OptimizerDynamicInput dynamic = 2;
   }
 }

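
As a quick illustration of the tag scheme described in the OptimizerDynamicInput comment, the sketch below builds the dense, tag-ordered scalar list that the proto comment calls the hyper_parameters input. It is plain Python with a hypothetical helper name, not TensorFlow API code.

    # Sketch (hypothetical helper): build the tag-ordered scalar list.
    # Tags must be dense in [0, number_of_unique_tags); tables that share
    # a tag share the same scalar value.
    def build_dynamic_input_scalars(values_by_tag):
        num_tags = len(values_by_tag)
        # A skipped tag would make the embedding configuration invalid.
        assert sorted(values_by_tag) == list(range(num_tags)), "tags must be dense"
        return [float(values_by_tag[tag]) for tag in range(num_tags)]

    # Example: tag 0 carries a decayed learning rate shared by two tables,
    # tag 1 carries the frequency-aware Adagrad step counter.
    scalars = build_dynamic_input_scalars({0: 0.01 * 0.9, 1: 1234.0})
    # `scalars` is the list fed, in tag order, to the SendTPUEmbeddingGradients
    # op as the scalar input list described in the comment above.
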
@@ -131,6 +126,53 @@ message BoundedAdagradParameters {
   float max_accumulator = 3;
 }

+// Frequency Aware Adagrad optimizer. This optimizer implements the AdaGrad
+// algorithm and additionally allows one to:
+// * Scale the learning rate based on the frequency of updates. Sparsely
+//   updated rows are updated with a higher effective learning rate, and
+//   frequently updated rows are updated with a lower effective learning rate.
+// * Decay the growth of the accumulator values.
+// * Use L1 / L2 regularization for the weight updates.
+//
+// The optimization algorithm is shown below.
+//   counter(new) = counter(old) + 1
+//   accum(new) = max(accumulator_decay * accum(old) + grad^2,
+//                    initial_accumulator_value)
+//   lr_scale = min((step_counter / accum(new)) ^ probability_exponent,
+//                  max_lr_multiplier)
+//   update = grad * lr_scale / sqrt(accum(new))
+//   if (l1_regularization_strength > 0.0):
+//     update = update + l1_regularization_strength * sign(var(old))
+//   if (l2_regularization_strength > 0.0):
+//     update = update + l2_regularization_strength * var(old)
+//   var(new) = var(old) - lr_scale * grad * update
+message FrequencyAwareAdagradParameters {
+  // The L1 regularization parameter for adjusting the update based on the sign
+  // of the variable.
+  float l1_regularization_strength = 1;
+
+  // The L2 regularization parameter for adjusting the update based on the
+  // variable.
+  float l2_regularization_strength = 2;
+
+  // The exponent used for scaling the learning rate based on the sparsity of
+  // updates.
+  float probability_exponent = 4;
+
+  // The maximum value of the learning rate scale.
+  float max_lr_multiplier = 3;
+
+  // The decay for the Adagrad accumulator.
+  float accumulator_decay = 5;
+
+  // The initial and minimum value for the Adagrad accumulator.
+  float initial_accumulator_value = 6;
+
+  // The tag for identifying the step counter used for the frequency-aware
+  // Adagrad optimizer.
+  OptimizerDynamicInput step_counter = 7;
+}
+
 // https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/SGD
 // https://github.com/tensorflow/tensorflow/blob/6b6471f3ffb7f1fefe42d814aa5fb9ab7a535b58/tensorflow/core/kernels/training_ops.cc#L629
 message StochasticGradientDescentParameters {}
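
To make the field semantics easier to check, here is a literal, single-weight transcription of the pseudo-code in the FrequencyAwareAdagradParameters comment above (including its final var(new) line as written). The function name and the numeric values in the example call are hypothetical, and the step counter is assumed to arrive through the step_counter dynamic input.

    import math

    def frequency_aware_adagrad_step(var, accum, step_counter, grad,
                                     accumulator_decay, initial_accumulator_value,
                                     probability_exponent, max_lr_multiplier,
                                     l1_regularization_strength,
                                     l2_regularization_strength):
        # accum(new) = max(accumulator_decay * accum(old) + grad^2,
        #                  initial_accumulator_value)
        accum = max(accumulator_decay * accum + grad * grad,
                    initial_accumulator_value)
        # lr_scale is capped by max_lr_multiplier; sparsely updated rows
        # (small accum relative to the step counter) get a larger scale.
        lr_scale = min((step_counter / accum) ** probability_exponent,
                       max_lr_multiplier)
        update = grad * lr_scale / math.sqrt(accum)
        if l1_regularization_strength > 0.0:
            update += l1_regularization_strength * math.copysign(1.0, var)
        if l2_regularization_strength > 0.0:
            update += l2_regularization_strength * var
        # Final step exactly as written in the comment above.
        var -= lr_scale * grad * update
        return var, accum

    # Example call with made-up values for a single embedding weight.
    new_var, new_accum = frequency_aware_adagrad_step(
        var=0.5, accum=0.1, step_counter=100.0, grad=0.2,
        accumulator_decay=0.99, initial_accumulator_value=0.1,
        probability_exponent=0.5, max_lr_multiplier=10.0,
        l1_regularization_strength=0.0, l2_regularization_strength=0.0)
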
@@ -502,7 +544,6 @@ message HotIdReplicationConfiguration {
 message OptimizationParameters {
   // Learning rate used for updating the embedding layer parameters.
   LearningRate learning_rate = 13;
-  reserved 1;  // Old learning rate tag.

   // Limits to which to clip the weight values after the backward pass; not
   // present means no limits are applied.
@@ -550,6 +591,7 @@ message OptimizationParameters {
     AdagradParameters adagrad = 3;
     AdagradMomentumParameters adagrad_momentum = 26;
     BoundedAdagradParameters bounded_adagrad = 19;
+    FrequencyAwareAdagradParameters frequency_aware_adagrad = 30;
     StochasticGradientDescentParameters stochastic_gradient_descent = 4;
     FtrlParameters ftrl = 5;
     AdamParameters adam = 6;
@@ -567,9 +609,9 @@ message OptimizationParameters {
     AssignParameters assign = 25;
   }

-  reserved 15;  // Old use_gradient_accumulation.
+  reserved 1, 15;

-  // NEXT_ID: 30
+  // NEXT_ID: 31
 }

 // Specification of an optimization algorithm's state variables (both the main