diff --git a/examples/multibody/deformable/mpm_cloth.cc b/examples/multibody/deformable/mpm_cloth.cc
index 8f7f9263b50d..8ea9d3fa5df1 100644
--- a/examples/multibody/deformable/mpm_cloth.cc
+++ b/examples/multibody/deformable/mpm_cloth.cc
@@ -55,7 +55,8 @@ DEFINE_string(contact_approximation, "sap",
 // NOTE (changyu): here we choose k=100 for a smaller amount of penetration (0.01 mm, i.e., 1e-5 m).
 DEFINE_double(stiffness, 10.0, "Contact Stiffness.");
 DEFINE_double(friction, 0.0, "Contact Friction.");
-DEFINE_double(damping, 1e-5,
+DEFINE_double(margin, 1.0, "MPM-Rigid Margin.");
+DEFINE_double(damping, 1e-2,
               "Hunt and Crossley damping for the deformable body, only used when "
               "'contact_approximation' is set to 'lagged' or 'similar' [s/m].");
 DEFINE_bool(exact_line_search, false, "Enable exact_line_search for contact solving.");
@@ -207,11 +208,23 @@ int do_main() {
   std::vector<int> indices;
   for (int i = 0; i < length; ++i) {
     for (int j = 0; j < width; ++j) {
-      double z = FLAGS_testcase == 100 ? 0.051 : (FLAGS_testcase == 3 ? 0.26 : 0.3 + k * 0.1);
+      double z = (FLAGS_testcase == 100 || FLAGS_testcase == 111) ? 0.051 : (FLAGS_testcase == 3 ? 0.26 : 0.3 + k * 0.1);
       if (FLAGS_testcase == 2) z = 0.27;
       if (FLAGS_testcase == 5) z = 0.18;
-      inital_pos.emplace_back((0.5 - 0.5 * l) + i * dx + k * 0.01, (0.5 - 0.5 * l) + j * dx + k * 0.01, z);
-      inital_vel.emplace_back(0., 0., 0.);
+      double x = (0.5 - 0.5 * l) + i * dx + k * 0.01;
+      double y = (0.5 - 0.5 * l) + j * dx + k * 0.01;
+      inital_pos.emplace_back(x, y, z);
+
+      if (FLAGS_testcase == 111 || FLAGS_testcase == 222) {
+        double omega = 50.0;
+        double vx = -omega * (y - 0.5);
+        double vy = omega * (x - 0.5);
+        double vz = 0.0;
+
+        inital_vel.emplace_back(vx, vy, vz);
+      } else {
+        inital_vel.emplace_back(0., 0., 0.);
+      }
     }
   }
 
@@ -239,6 +252,7 @@ int do_main() {
   mpm_config.contact_damping = FLAGS_damping;
   mpm_config.contact_friction_mu = FLAGS_friction;
   mpm_config.exact_line_search = FLAGS_exact_line_search;
+  mpm_config.margin = FLAGS_margin;
   deformable_model.SetMpmConfig(std::move(mpm_config));
 
   /* All rigid and deformable models have been added. Finalize the plant. */
@@ -301,7 +315,7 @@ int do_main() {
     meshcat->StopRecording();
     meshcat->PublishRecording();
 
-    std::ofstream htmlFile("/home/changyu/Desktop/cloth.html");
+    std::ofstream htmlFile("/home/changyu/drake/cloth.html");
     htmlFile << meshcat->StaticHtml();
     htmlFile.close();
   }
diff --git a/multibody/gpu_mpm/cpu_mpm_model.h b/multibody/gpu_mpm/cpu_mpm_model.h
index d6e6c183e70e..4053f9f24679 100644
--- a/multibody/gpu_mpm/cpu_mpm_model.h
+++ b/multibody/gpu_mpm/cpu_mpm_model.h
@@ -23,6 +23,7 @@ struct MpmConfigParams {
   int contact_query_frequency{1};
   int mpm_bc{-1};
   bool exact_line_search{false};
+  T margin{static_cast<T>(1.0)};
 };
 
 // NOTE(changyu): `CpuMpmModel` is responsible for storing the initial config in `DeformableModel`,
diff --git a/multibody/gpu_mpm/cuda_mpm_kernels.cuh b/multibody/gpu_mpm/cuda_mpm_kernels.cuh
index 3af64ae37797..f66309d6bc44 100644
--- a/multibody/gpu_mpm/cuda_mpm_kernels.cuh
+++ b/multibody/gpu_mpm/cuda_mpm_kernels.cuh
@@ -69,6 +69,10 @@ __global__ void initialize_fem_state_kernel(
   }
 }
 
+/**
+  \param[in]  F       \in 2x2
+  \param[out] dphi_dF \in 2x2
+*/
 template <typename T>
 __device__ __host__
 inline void fixed_corotated_PK1_2D(const T* F, T* dphi_dF) {
@@ -77,17 +81,55 @@ inline void fixed_corotated_PK1_2D(const T* F, T* dphi_dF) {
   T U[4], sig[4], V[4];
   svd2x2(F, U, sig, V);
   T R[4];
   matmulT<2, 2, 2, T>(U, V, R);
   T J = determinant2(F);
-  T Finv[4];
-  inverse2(F, Finv);
-  dphi_dF[0] = T(2.) * config::MU * (F[0] - R[0]) + config::LAMBDA * (J - T(1.)) * J * Finv[0];
-  dphi_dF[1] = T(2.) * config::MU * (F[1] - R[1]) + config::LAMBDA * (J - T(1.)) * J * Finv[2];
-  dphi_dF[2] = T(2.) * config::MU * (F[2] - R[2]) + config::LAMBDA * (J - T(1.)) * J * Finv[1];
-  dphi_dF[3] = T(2.) * config::MU * (F[3] - R[3]) + config::LAMBDA * (J - T(1.)) * J * Finv[3];
+  T JFinvT[4];
+  cofactor_matrix_2x2(F, JFinvT);
+
+  // P.noalias() = (T)2 * mu * (s.F - s.R) + lambda * (s.J - 1) * s.JFinvT;
+  dphi_dF[0] = T(2.) * config::MU * (F[0] - R[0]) + config::LAMBDA * (J - T(1.)) * JFinvT[0];
+  dphi_dF[1] = T(2.) * config::MU * (F[1] - R[1]) + config::LAMBDA * (J - T(1.)) * JFinvT[1];
+  dphi_dF[2] = T(2.) * config::MU * (F[2] - R[2]) + config::LAMBDA * (J - T(1.)) * JFinvT[2];
+  dphi_dF[3] = T(2.) * config::MU * (F[3] - R[3]) + config::LAMBDA * (J - T(1.)) * JFinvT[3];
 }
 
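// [Editorial sketch] The hunk above switches the volumetric term of the PK1
// stress from J * inverse(F) to cofactor_matrix_2x2. This rests on the 2x2
// identity cof(F) = J F^{-T}, which (unlike the explicit inverse) stays finite
// as J -> 0, i.e., for degenerate elements. Assuming the row-major storage
// used throughout this file (the real cofactor_matrix_2x2 may differ), a
// minimal version of that helper is:
template <typename T>
__device__ __host__
inline void cofactor_matrix_2x2_sketch(const T* F, T* JFinvT) {
  // F = [F0 F1; F2 F3]  =>  cof(F) = J * F^{-T} = [ F3 -F2; -F1 F0 ].
  JFinvT[0] =  F[3];
  JFinvT[1] = -F[2];
  JFinvT[2] = -F[1];
  JFinvT[3] =  F[0];
}
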
+/**
+  \param[in]  F  \in 2x2
+  \param[in]  dF \in 2x2
+  \param[out] dP \in 2x2
+*/
 template <typename T>
 __device__ __host__
-inline void compute_dphi_dF(const T* F, T* dphi_dF) {
+inline void fixed_corotated_PK1_differential_2D(const T* F, const T* dF, T* dP) {
+  T U[4], sig[4], V[4];
+  svd2x2(F, U, sig, V);
+  T R[4], S[4], tmp[4];
+  matmulT<2, 2, 2, T>(U, V, R);
+  matmul<2, 2, 2, T>(V, sig, tmp);
+  matmulT<2, 2, 2, T>(tmp, V, S);
+  T J = determinant2(F);
+  T JFinvT[4];
+  cofactor_matrix_2x2(F, JFinvT);
+
+  // dP.noalias() = lambda * s.JFinvT.cwiseProduct(dF).sum() * s.JFinvT;
+  T JFinvT_cwiseProduct_dF_sum = JFinvT[0] * dF[0] + JFinvT[1] * dF[1] + JFinvT[2] * dF[2] + JFinvT[3] * dF[3];
+  for (int i = 0; i < 4; ++i) dP[i] = config::LAMBDA * JFinvT_cwiseProduct_dF_sum * JFinvT[i];
+
+  // dP += 2 * mu * dF;
+  for (int i = 0; i < 4; ++i) dP[i] += T(2.) * config::MU * dF[i];
+
+  // addScaledRotationalDifferential(s.R, s.S, dF, -2 * mu, dP);
+  add_scaled_rotational_differential_2x2(R, S, dF, -T(2.) * config::MU, dP);
+
+  // addScaledCofactorMatrixDifferential(s.F, dF, lambda * (s.J - (T)1), dP);
+  add_scaled_cofactor_matrix_differential_2x2(F, dF, config::LAMBDA * (J - T(1.)), dP);
+}
+
+/**
+  \param[in]  F       \in 3x3
+  \param[out] dphi_dF \in 3x3
+*/
+template <typename T>
+__device__ __host__
+inline void first_piola(const T* F, T* dphi_dF) {
   // A00=0, A01=1, A02=2
   // A10=3, A11=4, A12=5
   // A20=6, A21=7, A22=8
@@ -153,6 +195,145 @@ inline void first_piola(const T* F, T* dphi_dF) {
   dphi_dF[8] = P_plane[8] + P_nonplane[8];
 }
 
+/**
+  \param[in]  F  \in 3x3
+  \param[in]  dF \in 3x3
+  \param[out] dP \in 3x3
+*/
+// Phat = Phat(R)
+// P(F)  = Q(F) Phat(R) = Q(F) Phat(R(F))
+// dP(F) = dQ(F) Phat(R(F)) + Q(F) dPhatdR(R(F)):dRdF(F):dF
+//       = dQ(F) Phat(R(F)) + Q(F) (dPhatdR(R(F)):dR)
+//       = dQ(F) Phat(R(F)) + Q(F) dPhat(R(F))
+template <typename T>
+__device__ __host__
+inline void first_piola_differential(const T* F, const T* dF, T* dP) {
+  // updateScratch(const TM& new_F, Scratch& s) { ...
+  // A00=0, A01=1, A02=2
+  // A10=3, A11=4, A12=5
+  // A20=6, A21=7, A22=8
+  T Q[9], R[9];
+  givens_QR<3, 3, T>(F, Q, R);
+  T R_hat[4] = {
+    R[0], R[1],
+    R[3], R[4]
+  };
+  T dphi_dF_2x2[4];
+  fixed_corotated_PK1_2D(R_hat, dphi_dF_2x2);
+  T P_hat[9] = {
+    dphi_dF_2x2[0], dphi_dF_2x2[1], 0,
+    dphi_dF_2x2[2], dphi_dF_2x2[3], 0,
+    0,              0,              0
+  };
+  T P_plane[9];
+  matmul<3, 3, 3, T>(Q, P_hat, P_plane);
+
+  T rr = R[2] * R[2] + R[5] * R[5];
+  T g = config::GAMMA * rr;
+  T gp = config::GAMMA;
+  T fp = 0;
+  if (R[8] < T(1.)) {
+    fp = -config::K * (T(1.) - R[8]) * (T(1.) - R[8]);
+  }
+
+  T A[9];
+  // TM A = TM::Zero();
+  // A(0, 0) = gp * sqr(s.R(0, 2));
+  // A(0, 1) = gp * s.R(0, 2) * s.R(1, 2);
+  // A(0, 2) = gp * s.R(2, 2) * s.R(0, 2);
+  // A(1, 1) = gp * sqr(s.R(1, 2));
+  // A(1, 2) = gp * s.R(2, 2) * s.R(1, 2);
+  // A(2, 2) = fp * s.R(2, 2);
+  // A(1, 0) = A(0, 1);
+  // A(2, 0) = A(0, 2);
+  // A(2, 1) = A(1, 2);
+  A[0] = gp * R[2] * R[2];
+  A[1] = gp * R[2] * R[5];
+  A[2] = gp * R[8] * R[2];
+  A[4] = gp * R[5] * R[5];
+  A[5] = gp * R[8] * R[5];
+  A[8] = fp * R[8];
+  A[3] = A[1];
+  A[6] = A[2];
+  A[7] = A[5];
+
+  T P_nonplane[9];
+  T Rinv[9], QA[9];
+  inverse3(R, Rinv);
+  matmul<3, 3, 3, T>(Q, A, QA);
+  matmulT<3, 3, 3, T>(QA, Rinv, P_nonplane);
+  // ... }
+
+  // TM dQ, dR;
+  // EIGEN_EXT::QRDifferential(s.Q, s.R, dF, dQ, dR);
+  T dQ[9], dR[9];
+  QR_differential_3x3(Q, R, dF, dQ, dR);
+
+  // Matrix dPhat;
+  // corotated.firstPiolaDifferential(s.corotated_scratch, dR.template topLeftCorner<2, 2>(), dPhat);
+  T dPhat[4];
+  T dR_hat[4] = {
+    dR[0], dR[1],
+    dR[3], dR[4]
+  };
+  fixed_corotated_PK1_differential_2D(R_hat, dR_hat, dPhat);
+
+  // TM dPhat_full = TM::Zero();
+  T dPhat_full[9];
+  for (int i = 0; i < 9; ++i) dPhat_full[i] = 0;
+
+  // dPhat_full.template topLeftCorner<2, 2>() = dPhat;
+  dPhat_full[0] = dPhat[0];
+  dPhat_full[1] = dPhat[1];
+  dPhat_full[3] = dPhat[2];
+  dPhat_full[4] = dPhat[3];
+
+  // dP = s.Q * dPhat_full + dQ * s.Phat;
+  T QdPhat_full[9], dQPhat[9];
+  matmul<3, 3, 3, T>(Q, dPhat_full, QdPhat_full);
+  matmul<3, 3, 3, T>(dQ, P_hat, dQPhat);
+  for (int i = 0; i < 9; ++i) dP[i] = QdPhat_full[i] + dQPhat[i];
+
+  // TM dA;
+  T dA[9];
+
+  // T gp = gamma; // dgp = 0;
+  // T fp = 0, dfp = 0;
+  // if (s.R(2, 2) < (T)1) {
+  //   T zz = 1 - s.R(2, 2);
+  //   fp = -k * sqr(zz);
+  //   dfp = 2 * k * zz * dR(2, 2);
+  // }
+  T dfp = 0;  // NOTE: others defined above
+  if (R[8] < T(1.)) {
+    T zz = T(1.) - R[8];
+    dfp = T(2.) * config::K * zz * dR[8];
+  }
+
+  dA[0] = gp * T(2.) * R[2] * dR[2];
+  dA[1] = gp * (dR[2] * R[5] + R[2] * dR[5]);
+  dA[2] = gp * (dR[8] * R[2] + R[8] * dR[2]);
+  dA[4] = gp * T(2.) * R[5] * dR[5];
+  dA[5] = gp * (dR[8] * R[5] + R[8] * dR[5]);
+  dA[8] = dfp * R[8] + fp * dR[8];
+  dA[3] = dA[1];
+  dA[6] = dA[2];
+  dA[7] = dA[5];
+
+  // dP += (s.Q * dA - s.P_nonplane * dR.transpose()) * s.R_inverse.transpose() - s.Q * dQ.transpose() * s.P_nonplane;
+  T Q_dA[9], P_nonplane_dRT[9], Q_dA_minus_P_nonplane_dRt[9], Q_dA_minus_P_nonplane_dRt_RinvT[9];
+  matmul<3, 3, 3, T>(Q, dA, Q_dA);
+  matmulT<3, 3, 3, T>(P_nonplane, dR, P_nonplane_dRT);
+  for (int i = 0; i < 9; ++i) Q_dA_minus_P_nonplane_dRt[i] = Q_dA[i] - P_nonplane_dRT[i];
+  matmulT<3, 3, 3, T>(Q_dA_minus_P_nonplane_dRt, Rinv, Q_dA_minus_P_nonplane_dRt_RinvT);
+
+  T Q_dQT[9], Q_dQT_P_nonplane[9];
+  matmulT<3, 3, 3, T>(Q, dQ, Q_dQT);
+  matmul<3, 3, 3, T>(Q_dQT, P_nonplane, Q_dQT_P_nonplane);
+
+  for (int i = 0; i < 9; ++i) dP[i] += Q_dA_minus_P_nonplane_dRt_RinvT[i] - Q_dQT_P_nonplane[i];
+}
+
 template <typename T>
 __device__ __host__
 inline void project_strain(T* F) {
@@ -190,13 +371,14 @@ inline void project_strain(T* F) {
   matmul<3, 3, 3, T>(Q, R, F);
 }
 
-template <typename T>
+template <typename T, bool POST_CONTACT>
 __global__ void calc_fem_state_and_force_kernel(
     const size_t n_faces,
     const int* indices,
     const int* index_mappings,
     const T* volumes,
     const T* affine_matrices,
+    const T* affine_matrices_star,
     const T* Dm_inverses,
     T* positions,
     T* velocities,
@@ -217,7 +399,18 @@ __global__ void calc_fem_state_and_force_kernel(
     }
 
     T* F = &deformation_gradients[idx * 9];
-    const T* C = &affine_matrices[face_pid * 9];
+    T C[9];
+    if constexpr (POST_CONTACT) {
+      #pragma unroll
+      for (int i = 0; i < 9; ++i) {
+        C[i] = (affine_matrices[face_pid * 9 + i] - affine_matrices_star[face_pid * 9 + i]);
+      }
+    } else {
+      #pragma unroll
+      for (int i = 0; i < 9; ++i) {
+        C[i] = affine_matrices[face_pid * 9 + i];
+      }
+    }
 
     T ctF[9];  // cotangent F
     // Eq.4 in Jiang et.al 2017, dE_p, β(x̂) = (∇x̂)p dE,n_p, β
@@ -265,7 +458,7 @@ __global__ void calc_fem_state_and_force_kernel(
     }
 
     T VP_local[9];
-    compute_dphi_dF(ctF, VP_local);
+    first_piola(ctF, VP_local);
     #pragma unroll
     for (int i = 0; i < 9; ++i) {
       VP_local[i] *= volumes[face_pid];
@@ -303,6 +496,98 @@ __global__ void calc_fem_state_and_force_kernel(
   }
 }
 
+template <typename T>
+__global__ void calc_implicit_fem_state_and_differential_kernel(
+    const size_t n_faces,
+    const int* indices,
+    const int* index_mappings,
+    const T* volumes,
+    const T* Dm_inverses,
+    const T* deformation_gradients,
+
+    // implicit states
+    const T* dvs,
+    const T* gradDvs,
+    T* dforces,
+    T* dtaus) {
+  uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx < n_faces) {
+    // NOTE: read index_mappings only after the bounds check above.
+    int face_pid = index_mappings[idx];
+    int v0 = index_mappings[indices[idx * 3 + 0]];
+    int v1 = index_mappings[indices[idx * 3 + 1]];
+    int v2 = index_mappings[indices[idx * 3 + 2]];
+
+    const T* F_n = &deformation_gradients[idx * 9];
+    const T* gradDv = &gradDvs[face_pid * 9];
+
+    // tangent_dF
+    T d0[3], d1[3];
+    #pragma unroll
+    for (int i = 0; i < 3; ++i) {
+      d0[i] = dvs[v1 * 3 + i] - dvs[v0 * 3 + i];
+      d1[i] = dvs[v2 * 3 + i] - dvs[v0 * 3 + i];
+    }
+    T ds[6] {
+      d0[0], d1[0],
+      d0[1], d1[1],
+      d0[2], d1[2]
+    };
+
+    T tangent_dF[6];
+    const T* Dm_inverse = &Dm_inverses[idx * 4];
+    matmul<3, 2, 2>(ds, Dm_inverse, tangent_dF);
+
+    // cotangent_dF
+    T ctF_n_c2[3] = { F_n[2], F_n[5], F_n[8] };
+    T ct_dF[3];
+    matmul<3, 3, 1, T>(gradDv, ctF_n_c2, ct_dF);
+
+    // dF
+    // dF << t_dF, ct_dF;
+    T dF[9] = {
+      tangent_dF[0], tangent_dF[1], ct_dF[0],
+      tangent_dF[2], tangent_dF[3], ct_dF[1],
+      tangent_dF[4], tangent_dF[5], ct_dF[2]
+    };
+
+    T VdP_local[9];
+    first_piola_differential(F_n, dF, VdP_local);
+    #pragma unroll
+    for (int i = 0; i < 9; ++i) {
+      VdP_local[i] *= volumes[face_pid];
+    }
+
+    // technical document, Eq. (15), part 2
+    T VdP_local_c2[3] = { VdP_local[2], VdP_local[5], VdP_local[8] };
+    outer_product<3, T>(VdP_local_c2, ct_dF, &dtaus[face_pid * 9]);
+
+    T grad_N_hat[6] = {
+      T(-1.), T(1.), T(0.),
+      T(-1.), T(0.), T(1.)
+    };
+    T grad_N[6];
+    T Dm_inverse_T[4];
+    transpose<2, 2, T>(Dm_inverse, Dm_inverse_T);
+    matmul<2, 2, 3, T>(Dm_inverse_T, grad_N_hat, grad_N);
+    T VdP_local_c01[6] = {
+      VdP_local[0], VdP_local[1],
+      VdP_local[3], VdP_local[4],
+      VdP_local[6], VdP_local[7]
+    };
+
+    T G[9];
+    matmul<3, 2, 3, T>(VdP_local_c01, grad_N, G);
+
+    // technical document, Eq. (15), part 1
+    #pragma unroll
+    for (int i = 0; i < 3; ++i) {
+      atomicAdd(&dforces[v0 * 3 + i], -G[i * 3 + 0]);
+      atomicAdd(&dforces[v1 * 3 + i], -G[i * 3 + 1]);
+      atomicAdd(&dforces[v2 * 3 + i], -G[i * 3 + 2]);
+    }
+  }
+}
+
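// [Editorial sketch] first_piola_differential computes dP = (dP/dF) : dF, the
// ingredient of the matrix-free K*dv products assembled by the solver below.
// A standard sanity check for such a differential is a central finite
// difference in an arbitrary direction dF. A host-side sketch using the
// helpers defined above (assumes <cmath>; not part of the patch):
template <typename T>
inline T first_piola_differential_fd_error(const T* F, const T* dF, const T eps) {
  T Fp[9], Fm[9], Pp[9], Pm[9], dP[9];
  for (int i = 0; i < 9; ++i) {
    Fp[i] = F[i] + eps * dF[i];
    Fm[i] = F[i] - eps * dF[i];
  }
  first_piola(Fp, Pp);
  first_piola(Fm, Pm);
  first_piola_differential(F, dF, dP);
  T err = T(0.);
  for (int i = 0; i < 9; ++i) {
    // Central difference approximates dP to O(eps^2); err should shrink as
    // eps -> 0 away from the singular configurations handled by the QR/SVD.
    const T e = std::abs((Pp[i] - Pm[i]) / (T(2.) * eps) - dP[i]);
    if (e > err) err = e;
  }
  return err;
}
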
 __device__ __host__ inline std::uint32_t contract_bits(std::uint32_t v) noexcept {
   v &= 0x09249249u;
@@ -425,7 +710,7 @@ __global__ void compute_sorted_state_kernel(const size_t n_particles,
   }
 }
 
-template <int BLOCK_DIM, typename T>
+template <int BLOCK_DIM, typename T, bool APPLY_GRAVITY, bool APPLY_ELASTICITY, bool APPLY_KDV>
 __global__ void particle_to_grid_kernel(const size_t n_particles,
                                         const T* positions,
                                         const T* velocities,
@@ -490,10 +775,17 @@ __global__ void particle_to_grid_kernel(const size_t n_particles,
 
     T B[9];
     const T* C = &affine_matrices[idx * 9];
-    const T* stress = &taus[idx * 9];
     #pragma unroll
     for (int i = 0; i < 9; ++i) {
-      B[i] = (-dt * config::G_D_INV) * stress[i] + C[i] * mass;
+      if constexpr (APPLY_KDV) {
+        const T* dstress = &taus[idx * 9];
+        B[i] = (-config::G_D_INV) * dstress[i];
+      } else if constexpr (APPLY_ELASTICITY) {
+        const T* stress = &taus[idx * 9];
+        B[i] = (-dt * config::G_D_INV) * stress[i] + C[i] * mass;
+      } else {
+        B[i] = C[i] * mass;
+      }
     }
 
     T val[4];
@@ -512,20 +804,37 @@ __global__ void particle_to_grid_kernel(const size_t n_particles,
 
           T weight = weights[threadIdx.x][i][0] * weights[threadIdx.x][j][1] * weights[threadIdx.x][k][2];
 
-          val[0] = mass * weight;
-          val[1] = vel[0] * val[0];
-          val[2] = vel[1] * val[0];
-          val[3] = vel[2] * val[0];
+          if constexpr (!APPLY_KDV) {
+            val[0] = mass * weight;
+            val[1] = vel[0] * val[0];
+            val[2] = vel[1] * val[0];
+            val[3] = vel[2] * val[0];
+          } else {
+            val[0] = 0;
+            val[1] = 0;
+            val[2] = 0;
+            val[3] = 0;
+          }
           // apply gravity
-          val[config::GRAVITY_AXIS + 1] += val[0] * config::GRAVITY * dt;
+          if constexpr (APPLY_GRAVITY) {
+            val[config::GRAVITY_AXIS + 1] += val[0] * config::GRAVITY * dt;
+          }
           val[1] += (B[0] * xi_minus_xp[0] + B[1] * xi_minus_xp[1] + B[2] * xi_minus_xp[2]) * weight;
           val[2] += (B[3] * xi_minus_xp[0] + B[4] * xi_minus_xp[1] + B[5] * xi_minus_xp[2]) * weight;
           val[3] += (B[6] * xi_minus_xp[0] + B[7] * xi_minus_xp[1] + B[8] * xi_minus_xp[2]) * weight;
-          const T* force = &forces[idx * 3];
-          val[1] += force[0] * dt * weight;
-          val[2] += force[1] * dt * weight;
-          val[3] += force[2] * dt * weight;
+          if constexpr (APPLY_KDV) {
+            const T* dforce = &forces[idx * 3];
+            val[1] += dforce[0] * weight;
+            val[2] += dforce[1] * weight;
+            val[3] += dforce[2] * weight;
+          }
+          if constexpr (APPLY_ELASTICITY) {
+            const T* force = &forces[idx * 3];
+            val[1] += force[0] * dt * weight;
+            val[2] += force[1] * dt * weight;
+            val[3] += force[2] * dt * weight;
+          }
 
           for (int iter = 1; iter <= mark; iter <<= 1) {
             T tmp[4];
@@ -541,7 +850,9 @@ __global__ void particle_to_grid_kernel(const size_t n_particles,
             const uint32_t target_cell_index = cell_index(base[0] + i, base[1] + j, base[2] + k);
             const uint32_t target_grid_index = target_cell_index >> (config::G_BLOCK_BITS * 3);
             g_touched_flags[target_grid_index] = 1;
-            atomicAdd(&(g_masses[target_cell_index]), val[0]);
+            if constexpr (!APPLY_KDV) {
+              atomicAdd(&(g_masses[target_cell_index]), val[0]);
+            }
             atomicAdd(&(g_momentum[target_cell_index * 3 + 0]), val[1]);
             atomicAdd(&(g_momentum[target_cell_index * 3 + 1]), val[2]);
             atomicAdd(&(g_momentum[target_cell_index * 3 + 2]), val[3]);
@@ -639,6 +950,20 @@ __global__ void clean_grid_contact_kernel(
   }
 }
 
+template <typename T>
+__global__ void clean_grid_Kdv_kernel(
+    const uint32_t touched_cells_cnt,
+    const uint32_t* g_touched_ids,
+    T* g_Kdv) {
+  uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx < touched_cells_cnt) {
+    uint32_t block_idx = g_touched_ids[idx >> (config::G_BLOCK_BITS * 3)];
+    uint32_t cell_idx = (block_idx << (config::G_BLOCK_BITS * 3)) | (idx & config::G_BLOCK_VOLUME_MASK);
+    #pragma unroll
+    for (int i = 0; i < 3; ++i) g_Kdv[cell_idx * 3 + i] = 0;
+  }
+}
+
 template <typename T>
 __global__ void update_grid_kernel(
     const uint32_t touched_cells_cnt,
@@ -805,13 +1130,15 @@ __global__ void update_grid_kernel(
   }
 }
 
-template <int BLOCK_DIM, typename T>
+template <int BLOCK_DIM, typename T, bool POST_CONTACT>
 __global__ void grid_to_particle_kernel(const size_t n_particles,
                                         T* positions,
                                         T* velocities,
                                         T* affine_matrices,
+                                        T* affine_matrices_star,
                                         const T* g_masses,
                                         const T* g_momentum,
+                                        const T* g_v_star,
                                         const T dt) {
   uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   // In [Fei et.al 2021],
@@ -839,15 +1166,19 @@ __global__ void grid_to_particle_kernel(const size_t n_particles,
     weights[threadIdx.x][2][i] = T(0.5) * (fx[i] - T(0.5)) * (fx[i] - T(0.5));
   }
 
+  T old_v[3];
   T new_v[3];
   T new_C[9], new_CT[9];
+  T old_C[9], old_CT[9];
   #pragma unroll
   for (int i = 0; i < 3; ++i) {
     new_v[i] = 0;
+    old_v[i] = 0;
   }
   #pragma unroll
   for (int i = 0; i < 9; ++i) {
     new_C[i] = 0;
+    old_C[i] = 0;
   }
 
   #pragma unroll
@@ -875,6 +1206,11 @@ __global__ void grid_to_particle_kernel(const size_t n_particles,
           new_v[0] += weight * g_v[0];
           new_v[1] += weight * g_v[1];
           new_v[2] += weight * g_v[2];
+          if constexpr (POST_CONTACT) {
+            old_v[0] += weight * g_v_star[target_cell_index * 3 + 0];
+            old_v[1] += weight * g_v_star[target_cell_index * 3 + 1];
+            old_v[2] += weight * g_v_star[target_cell_index * 3 + 2];
+          }
 
           // printf("weight=%lf, g_v=(%lf %lf %lf)\n", weight, g_v[0], g_v[1], g_v[2]);
           // printf("i=%d j=%d k=%d\n", i, j, k);
@@ -890,6 +1226,17 @@ __global__ void grid_to_particle_kernel(const size_t n_particles,
           new_C[6] += 4 * config::G_DX_INV * weight * g_v[2] * xi_minus_xp[0];
           new_C[7] += 4 * config::G_DX_INV * weight * g_v[2] * xi_minus_xp[1];
           new_C[8] += 4 * config::G_DX_INV * weight * g_v[2] * xi_minus_xp[2];
+          if constexpr (POST_CONTACT) {
+            old_C[0] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 0] * xi_minus_xp[0];
+            old_C[1] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 0] * xi_minus_xp[1];
+            old_C[2] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 0] * xi_minus_xp[2];
+            old_C[3] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 1] * xi_minus_xp[0];
+            old_C[4] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 1] * xi_minus_xp[1];
+            old_C[5] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 1] * xi_minus_xp[2];
+            old_C[6] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 2] * xi_minus_xp[0];
+            old_C[7] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 2] * xi_minus_xp[1];
+            old_C[8] += 4 * config::G_DX_INV * weight * g_v_star[target_cell_index * 3 + 2] * xi_minus_xp[2];
+          }
         }
       }
     }
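// [Editorial aside] Every transfer kernel in this file rebuilds the same
// quadratic B-spline weights
//   w0 = 0.5*(1.5 - fx)^2,  w1 = 0.75 - (fx - 1)^2,  w2 = 0.5*(fx - 0.5)^2,
// which form a partition of unity for fx in [0.5, 1.5]; that identity is what
// keeps the P2G/G2P transfers conservative. A one-line host check (editorial
// sketch, assumes <cmath>):
inline bool bspline_weights_sum_to_one_sketch(double fx) {
  const double w0 = 0.5 * (1.5 - fx) * (1.5 - fx);
  const double w1 = 0.75 - (fx - 1.0) * (fx - 1.0);
  const double w2 = 0.5 * (fx - 0.5) * (fx - 0.5);
  return std::abs(w0 + w1 + w2 - 1.0) < 1e-12;  // exact algebraic identity
}
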
@@ -905,20 +1252,49 @@ __global__ void grid_to_particle_kernel(const size_t n_particles, velocities[idx * 3 + 2] = new_v[2]; transpose<3, 3, T>(new_C, new_CT); - affine_matrices[idx * 9 + 0] = ((config::V + T(1.)) * T(.5)) * new_C[0] + ((config::V - T(1.)) * T(.5)) * new_CT[0]; - affine_matrices[idx * 9 + 1] = ((config::V + T(1.)) * T(.5)) * new_C[1] + ((config::V - T(1.)) * T(.5)) * new_CT[1]; - affine_matrices[idx * 9 + 2] = ((config::V + T(1.)) * T(.5)) * new_C[2] + ((config::V - T(1.)) * T(.5)) * new_CT[2]; - affine_matrices[idx * 9 + 3] = ((config::V + T(1.)) * T(.5)) * new_C[3] + ((config::V - T(1.)) * T(.5)) * new_CT[3]; - affine_matrices[idx * 9 + 4] = ((config::V + T(1.)) * T(.5)) * new_C[4] + ((config::V - T(1.)) * T(.5)) * new_CT[4]; - affine_matrices[idx * 9 + 5] = ((config::V + T(1.)) * T(.5)) * new_C[5] + ((config::V - T(1.)) * T(.5)) * new_CT[5]; - affine_matrices[idx * 9 + 6] = ((config::V + T(1.)) * T(.5)) * new_C[6] + ((config::V - T(1.)) * T(.5)) * new_CT[6]; - affine_matrices[idx * 9 + 7] = ((config::V + T(1.)) * T(.5)) * new_C[7] + ((config::V - T(1.)) * T(.5)) * new_CT[7]; - affine_matrices[idx * 9 + 8] = ((config::V + T(1.)) * T(.5)) * new_C[8] + ((config::V - T(1.)) * T(.5)) * new_CT[8]; - // Advection - positions[idx * 3 + 0] += new_v[0] * dt; - positions[idx * 3 + 1] += new_v[1] * dt; - positions[idx * 3 + 2] += new_v[2] * dt; + if constexpr (POST_CONTACT) { + transpose<3, 3, T>(old_C, old_CT); + + affine_matrices[idx * 9 + 0] = ((config::V + T(1.)) * T(.5)) * new_C[0] + ((config::V - T(1.)) * T(.5)) * new_CT[0]; + affine_matrices[idx * 9 + 1] = ((config::V + T(1.)) * T(.5)) * new_C[1] + ((config::V - T(1.)) * T(.5)) * new_CT[1]; + affine_matrices[idx * 9 + 2] = ((config::V + T(1.)) * T(.5)) * new_C[2] + ((config::V - T(1.)) * T(.5)) * new_CT[2]; + affine_matrices[idx * 9 + 3] = ((config::V + T(1.)) * T(.5)) * new_C[3] + ((config::V - T(1.)) * T(.5)) * new_CT[3]; + affine_matrices[idx * 9 + 4] = ((config::V + T(1.)) * T(.5)) * new_C[4] + ((config::V - T(1.)) * T(.5)) * new_CT[4]; + affine_matrices[idx * 9 + 5] = ((config::V + T(1.)) * T(.5)) * new_C[5] + ((config::V - T(1.)) * T(.5)) * new_CT[5]; + affine_matrices[idx * 9 + 6] = ((config::V + T(1.)) * T(.5)) * new_C[6] + ((config::V - T(1.)) * T(.5)) * new_CT[6]; + affine_matrices[idx * 9 + 7] = ((config::V + T(1.)) * T(.5)) * new_C[7] + ((config::V - T(1.)) * T(.5)) * new_CT[7]; + affine_matrices[idx * 9 + 8] = ((config::V + T(1.)) * T(.5)) * new_C[8] + ((config::V - T(1.)) * T(.5)) * new_CT[8]; + + affine_matrices_star[idx * 9 + 0] = ((config::V + T(1.)) * T(.5)) * old_C[0] + ((config::V - T(1.)) * T(.5)) * old_CT[0]; + affine_matrices_star[idx * 9 + 1] = ((config::V + T(1.)) * T(.5)) * old_C[1] + ((config::V - T(1.)) * T(.5)) * old_CT[1]; + affine_matrices_star[idx * 9 + 2] = ((config::V + T(1.)) * T(.5)) * old_C[2] + ((config::V - T(1.)) * T(.5)) * old_CT[2]; + affine_matrices_star[idx * 9 + 3] = ((config::V + T(1.)) * T(.5)) * old_C[3] + ((config::V - T(1.)) * T(.5)) * old_CT[3]; + affine_matrices_star[idx * 9 + 4] = ((config::V + T(1.)) * T(.5)) * old_C[4] + ((config::V - T(1.)) * T(.5)) * old_CT[4]; + affine_matrices_star[idx * 9 + 5] = ((config::V + T(1.)) * T(.5)) * old_C[5] + ((config::V - T(1.)) * T(.5)) * old_CT[5]; + affine_matrices_star[idx * 9 + 6] = ((config::V + T(1.)) * T(.5)) * old_C[6] + ((config::V - T(1.)) * T(.5)) * old_CT[6]; + affine_matrices_star[idx * 9 + 7] = ((config::V + T(1.)) * T(.5)) * old_C[7] + ((config::V - T(1.)) * T(.5)) * old_CT[7]; + affine_matrices_star[idx * 9 + 8] = 
((config::V + T(1.)) * T(.5)) * old_C[8] + ((config::V - T(1.)) * T(.5)) * old_CT[8];
+
+    positions[idx * 3 + 0] += (new_v[0] - old_v[0]) * dt;
+    positions[idx * 3 + 1] += (new_v[1] - old_v[1]) * dt;
+    positions[idx * 3 + 2] += (new_v[2] - old_v[2]) * dt;
+  }
+  else {
+    affine_matrices[idx * 9 + 0] = ((config::V + T(1.)) * T(.5)) * new_C[0] + ((config::V - T(1.)) * T(.5)) * new_CT[0];
+    affine_matrices[idx * 9 + 1] = ((config::V + T(1.)) * T(.5)) * new_C[1] + ((config::V - T(1.)) * T(.5)) * new_CT[1];
+    affine_matrices[idx * 9 + 2] = ((config::V + T(1.)) * T(.5)) * new_C[2] + ((config::V - T(1.)) * T(.5)) * new_CT[2];
+    affine_matrices[idx * 9 + 3] = ((config::V + T(1.)) * T(.5)) * new_C[3] + ((config::V - T(1.)) * T(.5)) * new_CT[3];
+    affine_matrices[idx * 9 + 4] = ((config::V + T(1.)) * T(.5)) * new_C[4] + ((config::V - T(1.)) * T(.5)) * new_CT[4];
+    affine_matrices[idx * 9 + 5] = ((config::V + T(1.)) * T(.5)) * new_C[5] + ((config::V - T(1.)) * T(.5)) * new_CT[5];
+    affine_matrices[idx * 9 + 6] = ((config::V + T(1.)) * T(.5)) * new_C[6] + ((config::V - T(1.)) * T(.5)) * new_CT[6];
+    affine_matrices[idx * 9 + 7] = ((config::V + T(1.)) * T(.5)) * new_C[7] + ((config::V - T(1.)) * T(.5)) * new_CT[7];
+    affine_matrices[idx * 9 + 8] = ((config::V + T(1.)) * T(.5)) * new_C[8] + ((config::V - T(1.)) * T(.5)) * new_CT[8];
+
+    positions[idx * 3 + 0] += new_v[0] * dt;
+    positions[idx * 3 + 1] += new_v[1] * dt;
+    positions[idx * 3 + 2] += new_v[2] * dt;
+  }
 
     // printf("v=\n");
     // printf("[%.8lf %.8lf %.8lf]\n", new_v[0], new_v[1], new_v[2]);
@@ -930,6 +1306,122 @@ __global__ void grid_to_particle_kernel(const size_t n_particles,
   }
 }
 
+// NOTE(changyu): same as:
+//
+// template <class T, int dim>
+// void MpmForceBase<T, dim>::computeDvAndGradDv(const TVStack& dv) const
+// {
+//     ZIRAN_QUIET_TIMER();
+//     evalInterpolantAndGradient([&](int node_id) -> TV { return dv.col(node_id); }, scratch_vp, scratch_gradV);
+// }
+//
+template <int BLOCK_DIM, typename T>
+__global__ void grid_to_particle_dv_transfer_kernel(const size_t n_particles,
+                                                    const T* positions,
+                                                    T* dvs,
+                                                    T* gradDvs,
+                                                    const T* g_momentum,
+                                                    const T* g_Dir,
+                                                    const T* g_v_star,
+                                                    const T global_alpha,
+                                                    const bool apply_dir) {
+  uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  // In [Fei et.al 2021],
+  // we spill the B-spline weights (nine floats for each thread) by storing them into the shared memory
+  // (instead of registers), although none of them is shared between threads.
+  // This choice increases performance, particularly when the number of threads is large (§6.2.5).
+  __shared__ T weights[BLOCK_DIM][3][3];
+
+  if (idx < n_particles) {
+    uint32_t base[3] = {
+      static_cast<uint32_t>(positions[idx * 3 + 0] * config::G_DX_INV - T(0.5)),
+      static_cast<uint32_t>(positions[idx * 3 + 1] * config::G_DX_INV - T(0.5)),
+      static_cast<uint32_t>(positions[idx * 3 + 2] * config::G_DX_INV - T(0.5))
+    };
+    T fx[3] = {
+      positions[idx * 3 + 0] * config::G_DX_INV - static_cast<T>(base[0]),
+      positions[idx * 3 + 1] * config::G_DX_INV - static_cast<T>(base[1]),
+      positions[idx * 3 + 2] * config::G_DX_INV - static_cast<T>(base[2])
+    };
+    // Quadratic kernels [http://mpm.graphics Eqn.
123, with x=fx, fx-1,fx-2] + #pragma unroll + for (int i = 0; i < 3; ++i) { + weights[threadIdx.x][0][i] = T(0.5) * (T(1.5) - fx[i]) * (T(1.5) - fx[i]); + weights[threadIdx.x][1][i] = T(0.75) - (fx[i] - T(1.0)) * (fx[i] - T(1.0)); + weights[threadIdx.x][2][i] = T(0.5) * (fx[i] - T(0.5)) * (fx[i] - T(0.5)); + } + + T dv[3]; + T gradDv[9]; + #pragma unroll + for (int i = 0; i < 3; ++i) { + dv[i] = 0; + } + #pragma unroll + for (int i = 0; i < 9; ++i) { + gradDv[i] = 0; + } + + #pragma unroll + for (int i = 0; i < 3; ++i) { + #pragma unroll + for (int j = 0; j < 3; ++j) { + #pragma unroll + for (int k = 0; k < 3; ++k) { + T xi_minus_xp[3] = { + (i - fx[0]), + (j - fx[1]), + (k - fx[2]) + }; + + const uint32_t target_cell_index = cell_index(base[0] + i, base[1] + j, base[2] + k); + + T weight = weights[threadIdx.x][i][0] * weights[threadIdx.x][j][1] * weights[threadIdx.x][k][2]; + + T g_v[3]; + if (apply_dir) { + g_v[0] = g_Dir[target_cell_index * 3 + 0]; + g_v[1] = g_Dir[target_cell_index * 3 + 1]; + g_v[2] = g_Dir[target_cell_index * 3 + 2]; + } else { + g_v[0] = (g_momentum[target_cell_index * 3 + 0] + global_alpha * g_Dir[target_cell_index * 3 + 0]) - g_v_star[target_cell_index * 3 + 0]; + g_v[1] = (g_momentum[target_cell_index * 3 + 1] + global_alpha * g_Dir[target_cell_index * 3 + 1]) - g_v_star[target_cell_index * 3 + 1]; + g_v[2] = (g_momentum[target_cell_index * 3 + 2] + global_alpha * g_Dir[target_cell_index * 3 + 2]) - g_v_star[target_cell_index * 3 + 2]; + } + + dv[0] += weight * g_v[0]; + dv[1] += weight * g_v[1]; + dv[2] += weight * g_v[2]; + + gradDv[0] += 4 * config::G_DX_INV * weight * g_v[0] * xi_minus_xp[0]; + gradDv[1] += 4 * config::G_DX_INV * weight * g_v[0] * xi_minus_xp[1]; + gradDv[2] += 4 * config::G_DX_INV * weight * g_v[0] * xi_minus_xp[2]; + gradDv[3] += 4 * config::G_DX_INV * weight * g_v[1] * xi_minus_xp[0]; + gradDv[4] += 4 * config::G_DX_INV * weight * g_v[1] * xi_minus_xp[1]; + gradDv[5] += 4 * config::G_DX_INV * weight * g_v[1] * xi_minus_xp[2]; + gradDv[6] += 4 * config::G_DX_INV * weight * g_v[2] * xi_minus_xp[0]; + gradDv[7] += 4 * config::G_DX_INV * weight * g_v[2] * xi_minus_xp[1]; + gradDv[8] += 4 * config::G_DX_INV * weight * g_v[2] * xi_minus_xp[2]; + } + } + } + + dvs[idx * 3 + 0] = dv[0]; + dvs[idx * 3 + 1] = dv[1]; + dvs[idx * 3 + 2] = dv[2]; + + gradDvs[idx * 9 + 0] = gradDv[0]; + gradDvs[idx * 9 + 1] = gradDv[1]; + gradDvs[idx * 9 + 2] = gradDv[2]; + gradDvs[idx * 9 + 3] = gradDv[3]; + gradDvs[idx * 9 + 4] = gradDv[4]; + gradDvs[idx * 9 + 5] = gradDv[5]; + gradDvs[idx * 9 + 6] = gradDv[6]; + gradDvs[idx * 9 + 7] = gradDv[7]; + gradDvs[idx * 9 + 8] = gradDv[8]; + } +} + template __global__ void initialize_contact_velocities(const size_t n_contacts, T* contact_vel, @@ -963,38 +1455,9 @@ __device__ __host__ inline void get_color_coordinates(UINT x, UINT y, UINT z, UI // SAP model template __device__ void compute_contact_grad_and_hess( - const T phi0, const T dt, const T stiffness, const T damping, const T friction_mu, - const T *v0, const T *v_next, + const T phi_star, const T dt, const T stiffness, const T damping, const T friction_mu, + const T *v_star, const T *v_next, T *C_Hess, T *C_Grad) { - /* Solves the contact problem for a single particle against a rigid body - assuming the rigid body has infinite mass and inertia. - - Let phi be the penetration distance (positive when penetration occurs) and vn - be the relative velocity of the particle with respect to the rigid body in the - normal direction (vn>0 when separting). 
Then we have phi_dot = -vn. - - In the normal direction, the contact force is modeled as a linear elastic system - with Hunt-Crossley dissipation. - - f = k * phi_+ * (1 + d * phi_dot)_+ - - where phi_+ = max(0, phi) - - The momentum balance in the normal direction becomes - - m(vn_next - vn) = k * dt * (phi0 - dt * vn_next)_+ * (1 - d * vn_next)_+ - - where we used the fact that phi = phi0 - dt * vn_next. This is a quadratic - equation in vn_next, and we solve it to get the next velocity vn_next. - - The quadratic equation is ax^2 + bx + c = 0, where - - a = k * d * dt^2 - b = -m - (k * dt * (dt + d * phi0)) - c = k * dt * phi0 + m * vn - - After solving for vn_next, we check if the friction force lies in the friction - cone, if not, we project the velocity back into the friction cone. */ constexpr int kZAxis = 2; // NOTE: follow the pattern in https://github.com/RobotLocomotion/drake/blob/master/multibody/contact_solvers/sap/sap_hunt_crossley_constraint.cc @@ -1003,8 +1466,8 @@ __device__ void compute_contact_grad_and_hess( // NOTE (changyu): in math eqns from (https://arxiv.org/pdf/2312.03908) and in code, // ϕ differs by a sign. - const T phi = phi0 - dt * v_next[kZAxis]; - // If (-ϕ0 - δt vn)+ or (1 − dvn)+ equals zero, no impulse should be applied + const T phi = phi_star - dt * (v_next[kZAxis] - v_star[kZAxis]); + // If (-ϕ* - δt (vn - v*))+ or (1 − dvn)+ equals zero, no impulse should be applied if (T(1.) - damping * v_next[kZAxis] <= 0 || phi <= 0) { // Quick exits #pragma unroll for (int i = 0; i < 9; ++i) C_Hess[i] = 0; @@ -1014,32 +1477,35 @@ __device__ void compute_contact_grad_and_hess( else { // normal component (Compliant Contact) // fn(ϕ, vn) = k (−ϕ)+ (1 − dvn)+ - // γn(vn) = n(vn; ϕ0) = δt fn(ϕ0 + δt vn, vn) - // = δt k (-ϕ0 - δt vn)+ (1 − dvn)+ - const T yn = dt * stiffness * (phi0 - dt * v_next[kZAxis]) * (T(1.) - damping * v_next[kZAxis]); // Eq. 13 + // γn(vn) = n(vn; ϕ*) = δt fn(ϕ* + δt (vn - v*), vn) + // = δt k (-ϕ* - δt (vn - v*))+ (1 − dvn)+ + const T yn = dt * stiffness * (phi_star - dt * (v_next[kZAxis] - v_star[kZAxis])) * (T(1.) - damping * v_next[kZAxis]); // Eq. 13 // ∂²ln / ∂vn² = - δt ∂ fn / ∂vn - // = - δt ∂ fn(ϕ0 + δt vn, vn) / ∂vn - // = - δt k ∂ (-ϕ0 - δt vn)+ (1 − dvn)+ / ∂vn - // when both (-ϕ0 - δt vn) > 0 and (1 − dvn) > 0 satisfied - // = - δt k ∂ (-ϕ0 - δt vn) (1 − dvn) / ∂vn - // = - δt k ∂ (-ϕ0 - δt vn + ϕ0 dvn + d δt vn²) / ∂vn - // = - δt k (- δt + ϕ0 d + 2 d δt vn) - const T d2lndvn2 = - dt * stiffness * (-dt -phi0 * damping + T(2.) * damping * dt * v_next[kZAxis]); // Eq. 8 + // = - δt ∂ fn(ϕ* + δt (vn - v*), vn) / ∂vn + // = - δt k ∂ (-ϕ* - δt (vn - v*))+ (1 − dvn)+ / ∂vn + // when both (-ϕ* - δt (vn - v*)) > 0 and (1 − dvn) > 0 satisfied + // = - δt k ∂ (-ϕ* - δt vn + δt v*) (1 − dvn) / ∂vn + // = - δt k ∂ (-ϕ* - δt vn + δt v* + ϕ* d vn + d δt vn² - d δt v* vn) / ∂vn + // = - δt k (- δt + ϕ* d + 2 d δt vn - d δt v*) + const T d2lndvn2 = -dt * stiffness * (-dt + + (-phi_star) * damping + + T(2.) * damping * dt * v_next[kZAxis] + - damping * dt * v_star[kZAxis]); // Eq. 8 // frictional component (Lagged Model) // For a physical model of compliance for which γn is only a function of vn - // γn0 = δt fn(ϕ0, vn0) = δt k (−ϕ0)+ (1 − dvn0)+ - const T yn0 = max(stiffness * dt * phi0 * (T(1.) - damping * v0[kZAxis]), T(0.)); + // γn* = δt fn(ϕ*, vn*) = δt k (−ϕ*)+ (1 − dvn*)+ + const T yn_star = max(stiffness * dt * phi_star * (T(1.) 
- damping * v_star[kZAxis]), T(0.)); const T ts_coeff = sqrt(v_next[0] * v_next[0] + v_next[1] * v_next[1] + config::epsv * config::epsv); const T ts_hat[2] = {v_next[0] / ts_coeff, v_next[1] / ts_coeff}; // Eq. 18 - // γt = -μ * γn0 * t̂_s + // γt = -μ * γn* * t̂_s const T yt[2] = { - -friction_mu * yn0 * ts_hat[0], - -friction_mu * yn0 * ts_hat[1] + -friction_mu * yn_star * ts_hat[0], + -friction_mu * yn_star * ts_hat[1] }; // Eq. 33 T P_ts_hat[4]; @@ -1048,8 +1514,8 @@ __device__ void compute_contact_grad_and_hess( T(1.) - P_ts_hat[0], -P_ts_hat[1], -P_ts_hat[2], T(1.) - P_ts_hat[3] }; - const T d2ltdvt2_coeff = friction_mu * yn0 / ts_coeff; // Eq. 33, ts_coeff = ts_soft_norm + epsv - // ∂²lt / ∂vt² = μ * γn0 * (P⊥(t̂_s) / (||v_t||_s + ε_s)) + const T d2ltdvt2_coeff = friction_mu * yn_star / ts_coeff; // Eq. 33, ts_coeff = ts_soft_norm + epsv + // ∂²lt / ∂vt² = μ * γn* * (P⊥(t̂_s) / (||v_t||_s + ε_s)) const T d2ltdvt2[4] = { d2ltdvt2_coeff * P_perp_ts_hat[0], d2ltdvt2_coeff * P_perp_ts_hat[1], @@ -1079,7 +1545,7 @@ template __global__ void contact_particle_to_grid_kernel(const size_t n_particles, const T* contact_pos, const T* contact_vel, - const T* velocities, + const T* contact_vel_star, const T* volumes, const uint32_t* contact_mpm_id, const T* contact_dist, @@ -1141,26 +1607,16 @@ __global__ void contact_particle_to_grid_kernel(const size_t n_particles, } const T mass = volumes[contact_mpm_id[idx]] * config::DENSITY; - const T* particle_vn = &velocities[contact_mpm_id[idx] * 3]; + const T* particle_v_star = &contact_vel_star[idx * 3]; const T* particle_v = &contact_vel[idx * 3]; T nhat_W[3] = {contact_normal[idx * 3 + 0], contact_normal[idx * 3 + 1], contact_normal[idx * 3 + 2]}; - // TODO (changyu): const GpuT phi0 = -( - // static_cast(mpm_contact_pairs[i].penetration_distance) + - // (mpm_state->positions_host()[mpm_contact_pairs[i].particle_in_contact_index] - - // mpm_contact_pairs[i].particle_in_contact_position.template cast()).dot(nhat_W) - // ); - T phi0 = -contact_dist[idx]; -#ifdef DEBUG - if (phi0 < 0) { - printf("IMPOSSIBLE!!!\n"); - } -#endif + T phi_star = -contact_dist[idx]; - T vn_rel_W[3] = { - particle_vn[0] - contact_rigid_v[idx * 3 + 0], - particle_vn[1] - contact_rigid_v[idx * 3 + 1], - particle_vn[2] - contact_rigid_v[idx * 3 + 2] + T v_star_rel_W[3] = { + particle_v_star[0] - contact_rigid_v[idx * 3 + 0], + particle_v_star[1] - contact_rigid_v[idx * 3 + 1], + particle_v_star[2] - contact_rigid_v[idx * 3 + 2] }; T v_rel_W[3] = { particle_v[0] - contact_rigid_v[idx * 3 + 0], @@ -1173,12 +1629,12 @@ __global__ void contact_particle_to_grid_kernel(const size_t n_particles, make_from_one_unit_vector(nhat_W, kZAxis, R_WC); transpose<3, 3, T>(R_WC, R_CW); - T vn_C[3], v_next_C[3]; // in the contact local coordinate - matmul<3, 3, 1, T>(R_CW, vn_rel_W, vn_C); + T v_star_C[3], v_next_C[3]; // in the contact local coordinate + matmul<3, 3, 1, T>(R_CW, v_star_rel_W, v_star_C); matmul<3, 3, 1, T>(R_CW, v_rel_W, v_next_C); T lc_Hess_C[9], lc_Grad_C[3]; // hess and grad in the contact local coordinate - compute_contact_grad_and_hess(phi0, dt, stiffness, damping, friction_mu, vn_C, v_next_C, lc_Hess_C, lc_Grad_C); + compute_contact_grad_and_hess(phi_star, dt, stiffness, damping, friction_mu, v_star_C, v_next_C, lc_Hess_C, lc_Grad_C); // hess and grad in the world local coordinate @@ -1326,7 +1782,7 @@ template __global__ void grid_to_particle_vdb_line_search_kernel(const size_t n_particles, const T* contact_pos, const T* contact_vel, - const T* velocities, + const T* 
contact_vel_star, const T* volumes, const uint32_t* contact_mpm_id, const T* contact_dist, @@ -1435,20 +1891,15 @@ __global__ void grid_to_particle_vdb_line_search_kernel(const size_t n_particles } const T mass = volumes[contact_mpm_id[idx]] * config::DENSITY; - const T* v_p_n = &velocities[contact_mpm_id[idx] * 3]; + const T* v_p_star = &contact_vel_star[idx * 3]; T nhat_W[3] = {contact_normal[idx * 3 + 0], contact_normal[idx * 3 + 1], contact_normal[idx * 3 + 2]}; - T phi0 = -contact_dist[idx]; -#ifdef DEBUG - if (phi0 < 0) { - printf("IMPOSSIBLE!!!\n"); - } -#endif + T phi_star = -contact_dist[idx]; - T vn_rel_W[3] = { - v_p_n[0] - contact_rigid_v[idx * 3 + 0], - v_p_n[1] - contact_rigid_v[idx * 3 + 1], - v_p_n[2] - contact_rigid_v[idx * 3 + 2] + T v_star_rel_W[3] = { + v_p_star[0] - contact_rigid_v[idx * 3 + 0], + v_p_star[1] - contact_rigid_v[idx * 3 + 1], + v_p_star[2] - contact_rigid_v[idx * 3 + 2] }; T v_current_rel_W[3] = { v_p_current[0] - contact_rigid_v[idx * 3 + 0], @@ -1467,39 +1918,53 @@ __global__ void grid_to_particle_vdb_line_search_kernel(const size_t n_particles make_from_one_unit_vector(nhat_W, kZAxis, R_WC); transpose<3, 3, T>(R_WC, R_CW); - T vn_C[3], v_current_C[3], v_next_C[3]; // in the contact local coordinate - matmul<3, 3, 1, T>(R_CW, vn_rel_W, vn_C); + T v_star_C[3], v_current_C[3], v_next_C[3]; // in the contact local coordinate + matmul<3, 3, 1, T>(R_CW, v_star_rel_W, v_star_C); matmul<3, 3, 1, T>(R_CW, v_current_rel_W, v_current_C); matmul<3, 3, 1, T>(R_CW, v_next_rel_W, v_next_C); // lc(v_p(v_i)) - auto lc = [&](const T* v, const T* v0) { + auto lc = [&](const T* v, const T* v_star) { // frictional component (Lagged Model) - // lt(v_t) = μ * γn0 * ||v_t||_s - const T yn0 = max(stiffness * dt * phi0 * (T(1.) - damping * v0[kZAxis]), T(0.)); - const T lt = friction_mu * yn0 * (sqrt(v[0] * v[0] + v[1] * v[1] + config::epsv * config::epsv) - config::epsv); // Eq. 33 + // lt(v_t) = μ * (γn*) * ||v_t||_s + const T yn_star = max(stiffness * dt * phi_star * (T(1.) - damping * v_star[kZAxis]), T(0.)); + const T lt = friction_mu * yn_star * (sqrt(v[0] * v[0] + v[1] * v[1] + config::epsv * config::epsv) - config::epsv); // Eq. 33 // normal component (Compliant Contact) - // vˆ = min(−ϕ0 / δt, 1 / d), - T v_hat = min(phi0 / dt, T(1.) / damping); + // we need to find the v such that −ϕ = 0 + // Starting with −ϕ = −ϕ* - dt * (v - v*) + // Setting -ϕ = 0 and solving for v, we get v = (-ϕ*) / dt + v* + // vˆ = min(−ϕ* / δt, 1 / d), + T v_hat = min(phi_star / dt + v_star[kZAxis], T(1.) 
/ damping); - // N(vn) = N+(min(vn, vˆ); f0) + // N(vn) = N+(min(vn, vˆ); f*) const T min_vn_v_hat = min(v_hat, v[kZAxis]); - // N+(vn; ϕ0) = δt k [−vn (ϕ0 + 1/2 δt vn) + d vn²/2 (ϕ0 + 2/3 δt vn)] - // = δt k [−ϕ0 vn - 1/2 δt vn² + 1/2 d ϕ0 vn² + 1/3 d δt vn³] - // = δt k [1/3 d δt vn³ + 1/2 (d ϕ0 - δt) vn² - ϕ0 vn] - // = ln_A vn³ + ln_B vn² + ln_C vn + // N+(vn; ϕ*) = ∫ n(vn; ϕ*) ∂vn + // = ∫ δt fn(ϕ* + δt (vn - v*), vn) ∂vn + // = ∫ δt k (-ϕ* - δt (vn - v*))+ (1 − dvn)+ ∂vn + + // when both (-ϕ* - δt (vn - v*)) > 0 and (1 − dvn) > 0 satisfied + // = ∫ δt k (-ϕ* - δt (vn - v*)) (1 − dvn) ∂vn + // = ∫ δt k (-ϕ* - δt vn + δt v*) (1 − dvn) ∂vn + // = ∫ δt k (-ϕ* - δt vn + δt v* + d ϕ* vn + d δt vn² - d δt v* vn) ∂vn + + // = ∫ δt k (d δt vn² + d ϕ* vn - δt vn - d δt v* vn -ϕ* + δt v*) ∂vn + // = ∫ δt k [ d δt vn² + (d ϕ* - δt - d δt v*) vn -ϕ* + δt v*] ∂vn + // = δt k [ 1/3 d δt vn³ + 1/2 (d ϕ* - δt - d δt v*) vn² + (-ϕ* + δt v*) vn] + // = 1/3 δt² k d vn³ + 1/2 δt k (d ϕ* - δt - d δt v*) + δt k (-ϕ* + δt v*) vn + // = N_A vn³ + N_B vn² + N_C vn + // where N_A = 1/3 δt² k d, - // N_B = 1/2 δt k (d ϕ0 - δt) - // N_C = δt k * (-ϕ0) + // N_B = 1/2 δt k (d ϕ* - δt - d δt v*) + // N_C = δt k * (-ϕ* + δt v*) const T N_A = T(1. / 3.) * dt * dt * stiffness * damping; - const T N_B = T(1. / 2.) * dt * stiffness * (-damping * phi0 - dt); - const T N_C = dt * stiffness * phi0; + const T N_B = T(1. / 2.) * dt * stiffness * (-damping * phi_star - dt - damping * dt * v_star[kZAxis]); + const T N_C = dt * stiffness * (phi_star + dt * v_star[kZAxis]); const T N_vn = N_A * min_vn_v_hat * min_vn_v_hat * min_vn_v_hat - + N_B * min_vn_v_hat * min_vn_v_hat - + N_C * min_vn_v_hat; + + N_B * min_vn_v_hat * min_vn_v_hat + + N_C * min_vn_v_hat; // ln(vn) = −N(vn), const T ln = -N_vn; @@ -1515,10 +1980,10 @@ __global__ void grid_to_particle_vdb_line_search_kernel(const size_t n_particles } } if (global_line_search) { - atomicAdd(g_E1, lc(v_next_C, vn_C)); + atomicAdd(g_E1, lc(v_next_C, v_star_C)); if constexpr (SOLVE_DF_DDF) { T lc_Hess_C[9], lc_Grad_C[3]; // hess and grad in the contact local coordinate - compute_contact_grad_and_hess(phi0, dt, stiffness, damping, friction_mu, vn_C, v_next_C, lc_Hess_C, lc_Grad_C); + compute_contact_grad_and_hess(phi_star, dt, stiffness, damping, friction_mu, v_star_C, v_next_C, lc_Hess_C, lc_Grad_C); T global_dir_C[3]; matmul<3, 3, 1, T>(R_CW, vp_search_dir_W, global_dir_C); atomicAdd(g_dE1, dot<3>(lc_Grad_C, global_dir_C)); @@ -1530,16 +1995,16 @@ __global__ void grid_to_particle_vdb_line_search_kernel(const size_t n_particles if constexpr (JACOBI) { printf("ERROR, JACOBI CANNOT USE LOCAL LINE-SEARCH!!!!!!!!!!!!!!!!!!!!!!!\n"); } - atomicAdd(&g_E1[color_index], lc(v_next_C, vn_C)); + atomicAdd(&g_E1[color_index], lc(v_next_C, v_star_C)); } if (eval_E0) { if (global_line_search) { - atomicAdd(g_E0, lc(v_current_C, vn_C)); + atomicAdd(g_E0, lc(v_current_C, v_star_C)); } else { if constexpr (JACOBI) { printf("ERROR, JACOBI CANNOT USE LOCAL LINE-SEARCH!!!!!!!!!!!!!!!!!!!!!!!\n"); } - atomicAdd(&g_E0[color_index], lc(v_current_C, vn_C)); + atomicAdd(&g_E0[color_index], lc(v_current_C, v_star_C)); } } } @@ -1616,12 +2081,16 @@ __global__ void update_global_energy_grid_kernel( const T* g_v_star, T* g_momentum, const T* g_Dir, + const T* g_Kdv0, + const T* g_Kdv, + const T* g_Kdir, T* global_E0, T* global_E1, T* global_dE1, T* global_d2E1, const uint32_t g_color_mask, const T global_alpha, + const T dt, const bool eval_E0) { uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < 
touched_cells_cnt) { @@ -1653,13 +2122,25 @@ __global__ void update_global_energy_grid_kernel( } if (eval_E0) { atomicAdd(global_E0, T(0.5) * mass * norm_sqr<3>(v_current_rel)); + // elasticity energy: 1/2 dt^2 ||dv||^2_K = 1/2 dt^2 * dv^T Kdv + const T* Kdv0 = &g_Kdv0[cell_idx * 3]; + atomicAdd(global_E0, T(0.5) * dt * dt * dot<3>(v_current_rel, Kdv0)); } // (1/2) * ||v_i - v_i^*||_M^2 atomicAdd(global_E1, T(0.5) * mass * norm_sqr<3>(v_next_rel)); + // elasticity energy: 1/2 dt^2 ||dv||^2_K = 1/2 dt^2 * dv^T Kdv + const T* Kdv = &g_Kdv[cell_idx * 3]; + atomicAdd(global_E1, T(0.5) * dt * dt * dot<3>(v_next_rel, Kdv)); + if constexpr(SOLVE_DF_DDF) { + const T* Kdir = &g_Kdir[cell_idx * 3]; + atomicAdd(global_dE1, mass * dot<3>(v_next_rel, Dir)); + atomicAdd(global_dE1, dt * dt * dot<3>(v_next_rel, Kdir)); + atomicAdd(global_d2E1, mass * norm_sqr<3>(Dir)); + atomicAdd(global_d2E1, dt * dt * dot<3>(Dir, Kdir)); } } } @@ -1673,7 +2154,8 @@ __global__ void apply_global_line_search_grid_kernel( T* g_momentum, const T* g_Dir, const uint32_t g_color_mask, - const T global_alpha) { + const T global_alpha, + const T dt) { uint32_t idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < touched_cells_cnt) { uint32_t block_idx = g_touched_ids[idx >> (config::G_BLOCK_BITS * 3)]; @@ -1686,6 +2168,14 @@ __global__ void apply_global_line_search_grid_kernel( g_vel[0] += global_alpha * Dir[0]; g_vel[1] += global_alpha * Dir[1]; g_vel[2] += global_alpha * Dir[2]; + + // NOTE (changyu): check CFL condition here +#if (DEBUG) + const T CFL_dt = config::G_DX / (norm<3>(g_vel) + 1e-10); + if (dt > CFL_dt) { + printf("dt exceds CFL dt limit (%lf)!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", dt / CFL_dt); + } +#endif } } } diff --git a/multibody/gpu_mpm/cuda_mpm_model.cu b/multibody/gpu_mpm/cuda_mpm_model.cu index 03826ebc4590..b7cda4b691a3 100644 --- a/multibody/gpu_mpm/cuda_mpm_model.cu +++ b/multibody/gpu_mpm/cuda_mpm_model.cu @@ -50,6 +50,7 @@ void GpuMpmState::Finalize() { CUDA_SAFE_CALL(cudaMalloc(&particle_buffer_[i].d_velocities, sizeof(Vec3) * n_particles_)); CUDA_SAFE_CALL(cudaMalloc(&particle_buffer_[i].d_volumes, sizeof(T) * n_particles_)); CUDA_SAFE_CALL(cudaMalloc(&particle_buffer_[i].d_affine_matrices, sizeof(Mat3) * n_particles_)); + CUDA_SAFE_CALL(cudaMalloc(&particle_buffer_[i].d_affine_matrices_star, sizeof(Mat3) * n_particles_)); CUDA_SAFE_CALL(cudaMalloc(&particle_buffer_[i].d_pids, sizeof(int) * n_particles_)); CUDA_SAFE_CALL(cudaMalloc(&particle_buffer_[i].d_sort_keys, sizeof(uint32_t) * n_particles_)); @@ -75,12 +76,17 @@ void GpuMpmState::Finalize() { cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemset(particle_buffer_[i].d_volumes, 0, sizeof(T) * n_particles_)); CUDA_SAFE_CALL(cudaMemset(particle_buffer_[i].d_affine_matrices, 0, sizeof(Mat3) * n_particles_)); + CUDA_SAFE_CALL(cudaMemset(particle_buffer_[i].d_affine_matrices_star, 0, sizeof(Mat3) * n_particles_)); } } // scratch data CUDA_SAFE_CALL(cudaMalloc(&d_forces_, sizeof(Vec3) * n_particles_)); CUDA_SAFE_CALL(cudaMalloc(&d_taus_, sizeof(Mat3) * n_particles_)); + CUDA_SAFE_CALL(cudaMalloc(&d_dforces_, sizeof(Vec3) * n_particles_)); + CUDA_SAFE_CALL(cudaMalloc(&d_dtaus_, sizeof(Mat3) * n_particles_)); + CUDA_SAFE_CALL(cudaMalloc(&d_dvs_, sizeof(Vec3) * n_particles_)); + CUDA_SAFE_CALL(cudaMalloc(&d_gradDvs_, sizeof(Mat3) * n_particles_)); CUDA_SAFE_CALL(cudaMalloc(&d_index_mappings_, sizeof(int) * n_particles_)); std::vector initial_index_mappings(n_particles_); std::iota(initial_index_mappings.begin(), initial_index_mappings.end(), 
0); @@ -103,6 +109,9 @@ void GpuMpmState::Finalize() { CUDA_SAFE_CALL(cudaMemset(grid_buffer_.d_g_touched_cnt, 0, sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_g_Hess_, config::G_DOMAIN_VOLUME * sizeof(Mat3))); + CUDA_SAFE_CALL(cudaMalloc(&d_g_Kdv0_, config::G_DOMAIN_VOLUME * sizeof(Vec3))); + CUDA_SAFE_CALL(cudaMalloc(&d_g_Kdir_, config::G_DOMAIN_VOLUME * sizeof(Vec3))); + CUDA_SAFE_CALL(cudaMalloc(&d_g_Kdv_, config::G_DOMAIN_VOLUME * sizeof(Vec3))); CUDA_SAFE_CALL(cudaMalloc(&d_g_Grad_, config::G_DOMAIN_VOLUME * sizeof(Vec3))); CUDA_SAFE_CALL(cudaMalloc(&d_g_Dir_, config::G_DOMAIN_VOLUME * sizeof(Vec3))); CUDA_SAFE_CALL(cudaMalloc(&d_g_alpha_, config::G_DOMAIN_VOLUME * sizeof(T))); @@ -128,6 +137,7 @@ void GpuMpmState::Destroy() { CUDA_SAFE_CALL(cudaFree(particle_buffer_[i].d_velocities)); CUDA_SAFE_CALL(cudaFree(particle_buffer_[i].d_volumes)); CUDA_SAFE_CALL(cudaFree(particle_buffer_[i].d_affine_matrices)); + CUDA_SAFE_CALL(cudaFree(particle_buffer_[i].d_affine_matrices_star)); CUDA_SAFE_CALL(cudaFree(particle_buffer_[i].d_pids)); CUDA_SAFE_CALL(cudaFree(particle_buffer_[i].d_sort_keys)); @@ -138,6 +148,7 @@ void GpuMpmState::Destroy() { particle_buffer_[i].d_velocities = nullptr; particle_buffer_[i].d_volumes = nullptr; particle_buffer_[i].d_affine_matrices = nullptr; + particle_buffer_[i].d_affine_matrices_star = nullptr; particle_buffer_[i].d_pids = nullptr; particle_buffer_[i].d_sort_keys = nullptr; particle_buffer_[i].d_sort_ids = nullptr; @@ -145,12 +156,20 @@ void GpuMpmState::Destroy() { CUDA_SAFE_CALL(cudaFree(d_forces_)); CUDA_SAFE_CALL(cudaFree(d_taus_)); + CUDA_SAFE_CALL(cudaFree(d_dforces_)); + CUDA_SAFE_CALL(cudaFree(d_dtaus_)); + CUDA_SAFE_CALL(cudaFree(d_dvs_)); + CUDA_SAFE_CALL(cudaFree(d_gradDvs_)); CUDA_SAFE_CALL(cudaFree(d_index_mappings_)); CUDA_SAFE_CALL(cudaFree(d_deformation_gradients_)); CUDA_SAFE_CALL(cudaFree(d_Dm_inverses_)); CUDA_SAFE_CALL(cudaFree(d_indices_)); d_forces_ = nullptr; d_taus_ = nullptr; + d_dforces_ = nullptr; + d_dtaus_ = nullptr; + d_dvs_ = nullptr; + d_gradDvs_ = nullptr; d_index_mappings_ = nullptr; d_deformation_gradients_ = nullptr; d_Dm_inverses_ = nullptr; @@ -168,6 +187,9 @@ void GpuMpmState::Destroy() { grid_buffer_.d_g_touched_cnt = nullptr; CUDA_SAFE_CALL(cudaFree(d_g_Hess_)); + CUDA_SAFE_CALL(cudaFree(d_g_Kdv0_)); + CUDA_SAFE_CALL(cudaFree(d_g_Kdir_)); + CUDA_SAFE_CALL(cudaFree(d_g_Kdv_)); CUDA_SAFE_CALL(cudaFree(d_g_Grad_)); CUDA_SAFE_CALL(cudaFree(d_g_Dir_)); CUDA_SAFE_CALL(cudaFree(d_g_alpha_)); @@ -175,6 +197,9 @@ void GpuMpmState::Destroy() { CUDA_SAFE_CALL(cudaFree(d_g_E0_)); CUDA_SAFE_CALL(cudaFree(d_g_E1_)); d_g_Hess_ = nullptr; + d_g_Kdv0_ = nullptr; + d_g_Kdir_ = nullptr; + d_g_Kdv_ = nullptr; d_g_Grad_ = nullptr; d_g_Dir_ = nullptr; d_g_alpha_ = nullptr; diff --git a/multibody/gpu_mpm/cuda_mpm_model.cuh b/multibody/gpu_mpm/cuda_mpm_model.cuh index d0904973c768..04bf9352de62 100644 --- a/multibody/gpu_mpm/cuda_mpm_model.cuh +++ b/multibody/gpu_mpm/cuda_mpm_model.cuh @@ -53,6 +53,8 @@ public: const T* current_volumes() const { return particle_buffer_[current_particle_buffer_id_].d_volumes; } T* current_affine_matrices() { return particle_buffer_[current_particle_buffer_id_].d_affine_matrices; } const T* current_affine_matrices() const { return particle_buffer_[current_particle_buffer_id_].d_affine_matrices; } + T* current_affine_matrices_star() { return particle_buffer_[current_particle_buffer_id_].d_affine_matrices_star; } + const T* current_affine_matrices_star() const { return 
particle_buffer_[current_particle_buffer_id_].d_affine_matrices_star; } int* current_pids() { return particle_buffer_[current_particle_buffer_id_].d_pids; } const int* current_pids() const { return particle_buffer_[current_particle_buffer_id_].d_pids; } @@ -65,6 +67,7 @@ public: T* next_velocities() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_velocities; } T* next_volumes() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_volumes; } T* next_affine_matrices() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_affine_matrices; } + T* next_affine_matrices_star() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_affine_matrices_star; } int* next_pids() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_pids; } uint32_t* next_sort_keys() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_sort_keys; } uint32_t* next_sort_ids() { return particle_buffer_[current_particle_buffer_id_ ^ 1].d_sort_ids; } @@ -75,6 +78,17 @@ public: const T* taus() const { return d_taus_; } T* deformation_gradients() { return d_deformation_gradients_; } const T* deformation_gradients() const { return d_deformation_gradients_; } + + // NOTE (changyu): Implicit states + T* dforces() { return d_dforces_; } + const T* dforces() const { return d_dforces_; } + T* dtaus() { return d_dtaus_; } + const T* dtaus() const { return d_dtaus_; } + T* dvs() { return d_dvs_; } + const T* dvs() const { return d_dvs_; } + T* gradDvs() { return d_gradDvs_; } + const T* gradDvs() const { return d_gradDvs_; } + T* Dm_inverses() { return d_Dm_inverses_; } const T* Dm_inverses() const { return d_Dm_inverses_; } int* indices() { return d_indices_; } @@ -100,6 +114,9 @@ public: } T* grid_Hess() { return d_g_Hess_; } + T* grid_Kdv0() { return d_g_Kdv0_; } + T* grid_Kdir() { return d_g_Kdir_; } + T* grid_Kdv() { return d_g_Kdv_; } T* grid_Grad() { return d_g_Grad_; } T* grid_Dir() { return d_g_Dir_; } T* grid_alpha() { return d_g_alpha_; } @@ -178,6 +195,10 @@ private: // scratch data T* d_forces_ = nullptr; // size: n_faces + n_verts, NO sort T* d_taus_ = nullptr; // size: n_faces + n_verts, NO sort + T* d_dforces_ = nullptr; // size: n_faces + n_verts, NO sort + T* d_dtaus_ = nullptr; // size: n_faces + n_verts, NO sort + T* d_dvs_ = nullptr; // size: n_faces + n_verts, NO sort + T* d_gradDvs_ = nullptr; // size: n_faces + n_verts, NO sort // NOTE (changyu): when particle data get sorted, // the index mapping from element/vertex index to particle index will be changed, // need to be updated in `compute_sorted_state_kernel`. @@ -194,6 +215,7 @@ private: T* d_velocities = nullptr; // size: n_faces + n_verts T* d_volumes = nullptr; // size: n_faces + n_verts T* d_affine_matrices = nullptr; // size: n_faces + n_verts + T* d_affine_matrices_star = nullptr; // size: n_faces + n_verts // used to work with index_mapping to get the original -> reordered mapping. 
  int* d_pids = nullptr;  // size: n_faces + n_verts
@@ -251,6 +273,9 @@ private:
   // Grid device ptrs for solving coordinate descent
   T* d_g_Hess_ = nullptr;
+  T* d_g_Kdv0_ = nullptr;
+  T* d_g_Kdir_ = nullptr;
+  T* d_g_Kdv_ = nullptr;
   T* d_g_Grad_ = nullptr;
   T* d_g_Dir_ = nullptr;
   T* d_g_alpha_ = nullptr;
diff --git a/multibody/gpu_mpm/cuda_mpm_solver.cu b/multibody/gpu_mpm/cuda_mpm_solver.cu
index e4b639f6c20a..805a0fe7a79e 100644
--- a/multibody/gpu_mpm/cuda_mpm_solver.cu
+++ b/multibody/gpu_mpm/cuda_mpm_solver.cu
@@ -70,21 +70,31 @@ void GpuMpmSolver<T>::RebuildMapping(GpuMpmState<T>* state, bool sort) const {
 }
 
 template <typename T>
-void GpuMpmSolver<T>::CalcFemStateAndForce(GpuMpmState<T>* state, const T& dt) const {
+void GpuMpmSolver<T>::CalcFemStateAndForce(GpuMpmState<T>* state, const T& dt, const bool post_contact) const {
   CUDA_SAFE_CALL(cudaMemset(state->forces(), 0, sizeof(Vec3<T>) * state->n_particles()));
   CUDA_SAFE_CALL(cudaMemset(state->taus(), 0, sizeof(Mat3<T>) * state->n_particles()));
-  CUDA_SAFE_CALL((
-    calc_fem_state_and_force_kernel<<<
-    (state->n_faces() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
-    (state->n_faces(), state->indices(), state->index_mappings(), state->current_volumes(), state->current_affine_matrices(), state->Dm_inverses(),
-     state->current_positions(), state->current_velocities(), state->deformation_gradients(),
-     state->forces(), state->taus(), dt)
-    ));
+  if (post_contact) {
+    CUDA_SAFE_CALL((
+      calc_fem_state_and_force_kernel<T, true><<<
+      (state->n_faces() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+      (state->n_faces(), state->indices(), state->index_mappings(), state->current_volumes(), state->current_affine_matrices(), state->current_affine_matrices_star(),
+       state->Dm_inverses(), state->current_positions(), state->current_velocities(), state->deformation_gradients(),
+       state->forces(), state->taus(), dt)
+      ));
+  } else {
+    CUDA_SAFE_CALL((
+      calc_fem_state_and_force_kernel<T, false><<<
+      (state->n_faces() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+      (state->n_faces(), state->indices(), state->index_mappings(), state->current_volumes(), state->current_affine_matrices(), nullptr,
+       state->Dm_inverses(), state->current_positions(), state->current_velocities(), state->deformation_gradients(),
+       state->forces(), state->taus(), dt)
+      ));
+  }
 }
 
 template <typename T>
-void GpuMpmSolver<T>::ParticleToGrid(GpuMpmState<T>* state, const T& dt) const {
+void GpuMpmSolver<T>::ParticleToGrid(GpuMpmState<T>* state, const T& dt, bool apply_gravity, bool apply_elasticity) const {
   const uint32_t &touched_blocks_cnt = state->grid_touched_cnt_host();
   const uint32_t &touched_cells_cnt = touched_blocks_cnt * config::G_BLOCK_VOLUME;
   if (touched_cells_cnt > 0) {
@@ -94,14 +104,50 @@ void GpuMpmSolver<T>::ParticleToGrid(GpuMpmState<T>* state, const T& dt) const {
       (touched_cells_cnt, state->grid_touched_ids(), state->grid_touched_flags(), state->grid_masses(), state->grid_momentum())
       ));
   }
-  CUDA_SAFE_CALL((
-    particle_to_grid_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T><<<
-    (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
-    (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_volumes(), state->current_affine_matrices(),
-     state->forces(), state->taus(),
-     state->current_sort_keys(),
-     state->grid_touched_flags(), state->grid_masses(), state->grid_momentum(), dt)
-    ));
+
+  if (!apply_gravity && !apply_elasticity) {
+    CUDA_SAFE_CALL((
+      particle_to_grid_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T, false, false, false><<<
+      (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+      (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_volumes(), state->current_affine_matrices(),
+       state->forces(), state->taus(),
+       state->current_sort_keys(),
+       state->grid_touched_flags(), state->grid_masses(), state->grid_momentum(), dt)
+      ));
+  }
+
+  if (apply_gravity && !apply_elasticity) {
+    CUDA_SAFE_CALL((
+      particle_to_grid_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T, true, false, false><<<
+      (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+      (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_volumes(), state->current_affine_matrices(),
+       state->forces(), state->taus(),
+       state->current_sort_keys(),
+       state->grid_touched_flags(), state->grid_masses(), state->grid_momentum(), dt)
+      ));
+  }
+
+  if (!apply_gravity && apply_elasticity) {
+    CUDA_SAFE_CALL((
+      particle_to_grid_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T, false, true, false><<<
+      (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+      (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_volumes(), state->current_affine_matrices(),
+       state->forces(), state->taus(),
+       state->current_sort_keys(),
+       state->grid_touched_flags(), state->grid_masses(), state->grid_momentum(), dt)
+      ));
+  }
+
+  if (apply_gravity && apply_elasticity) {
+    CUDA_SAFE_CALL((
+      particle_to_grid_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T, true, true, false><<<
+      (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+      (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_volumes(), state->current_affine_matrices(),
+       state->forces(), state->taus(),
+       state->current_sort_keys(),
+       state->grid_touched_flags(), state->grid_masses(), state->grid_momentum(), dt)
+      ));
+  }
 }
 
@@ -153,10 +199,20 @@ void GpuMpmSolver<T>::UpdateGrid(GpuMpmState<T>* state, int mpm_bc) const {
 template <typename T>
 void GpuMpmSolver<T>::GridToParticle(GpuMpmState<T>* state, const T& dt) const {
   CUDA_SAFE_CALL((
-    grid_to_particle_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T><<<
+    grid_to_particle_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T, false><<<
     (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
-    (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_affine_matrices(),
-     state->grid_masses(), state->grid_momentum(), dt)
+    (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_affine_matrices(), nullptr,
+     state->grid_masses(), state->grid_momentum(), state->grid_v_star(), dt)
     ));
 }
 
+template <typename T>
+void GpuMpmSolver<T>::ContactGridToParticle(GpuMpmState<T>* state, const T& dt) const {
+  CUDA_SAFE_CALL((
+    grid_to_particle_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T, true><<<
+    (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+    (state->n_particles(), state->current_positions(), state->current_velocities(), state->current_affine_matrices(), state->current_affine_matrices_star(),
+     state->grid_masses(), state->grid_momentum(), state->grid_v_star(), dt)
+    ));
+}
+
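// [Editorial aside] ParticleToGrid above enumerates all four
// (apply_gravity, apply_elasticity) combinations by hand because the kernel's
// bool parameters must be compile-time constants. An equivalent C++17 dispatch
// helper (editorial sketch, not part of the patch; assumes <type_traits> and
// <utility>) would collapse the four launches into one call site:
template <typename F>
void dispatch_bool2(bool a, bool b, F&& f) {
  // Maps two runtime bools onto the four compile-time instantiations.
  if (a) { b ? f(std::true_type{},  std::true_type{})
             : f(std::true_type{},  std::false_type{}); }
  else   { b ? f(std::false_type{}, std::true_type{})
             : f(std::false_type{}, std::false_type{}); }
}
// Hypothetical usage:
//   dispatch_bool2(apply_gravity, apply_elasticity, [&](auto g, auto e) {
//     particle_to_grid_kernel<config::DEFAULT_CUDA_BLOCK_SIZE, T,
//                             decltype(g)::value, decltype(e)::value, false>
//         <<<grid_dim, config::DEFAULT_CUDA_BLOCK_SIZE>>>(/* ...same args... */);
//   });
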
@@ -231,13 +287,18 @@ void GpuMpmSolver<T>::UpdateContact(GpuMpmState<T> *state, const int frame, cons
         (n_contacts, state->contact_pos(), state->contact_sort_keys(),
          state->contact_sort_ids())
         ));
 
-    const int max_newton_iterations = 2000;
+    const int max_newton_iterations = 500;
+    const int max_line_search_iterations = 100;
     constexpr bool use_jacobi = true;
     const T kTol = 1e-4;
     bool enable_line_search = true;
     const T jacobi_relax_coeff = 0.3;
     const bool global_line_search = use_jacobi;
+    if (!global_line_search) {
+        printf("Currently only global line search is supported!\n");
+        std::abort();  // a bare `throw;` here would reach std::terminate anyway; fail fast explicitly
+    }
 
     int count = 0;
     T norm_dir = 1e10;
@@ -264,11 +325,12 @@ void GpuMpmSolver<T>::UpdateContact(GpuMpmState<T> *state, const int frame, cons
 
     // NOTE (changyu): pre-compute contact particle velocity `contact_vel_star` after p2g2g before contact handling
     // then the dv changed by the implicit contact optimization problem can be extracted by `dv = contact_vel - contact_vel_star`.
+    // also, for the strong coupling scheme, we track `contact_vel_star` to handle the penetration distance estimation correctly.
     CUDA_SAFE_CALL((
-        grid_to_particle_kernel<T><<<
+        grid_to_particle_kernel<T, false><<<
         (n_contacts + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
-        (n_contacts, state->contact_pos(), state->contact_vel_star(), nullptr,
-         state->grid_masses(), state->grid_momentum(), dt)
+        (n_contacts, state->contact_pos(), state->contact_vel_star(), nullptr, nullptr,
+         state->grid_masses(), state->grid_momentum(), state->grid_v_star(), dt)
         ));
 
     while (norm_dir > kTol && count < max_newton_iterations) {
@@ -293,7 +355,7 @@
         (n_contacts,
          state->contact_pos(),
          state->contact_vel(),
-         state->current_velocities(),
+         state->contact_vel_star(),
         state->current_volumes(),
         state->contact_mpm_id(),
         state->contact_dist(),
@@ -318,6 +380,46 @@
         CUDA_SAFE_CALL(cudaDeviceSynchronize());
         CUDA_SAFE_CALL(cudaMemcpy(&total_grid_DoFs, total_grid_DoFs_d, sizeof(uint32_t), cudaMemcpyDeviceToHost));
         // printf("color(%u) total_grid_DoFs=%u\n", color_mask, total_grid_DoFs);
+
+        // NOTE (changyu): when the apply_dir flag is on,
+        // it computes the differential form Kdv with respect to the search direction,
+        // used by the exact line search to get the grad/hess of the energy with respect to the step size.
+        const auto &update_Kdv = [&](T *Kdv_ptr, const T alpha, bool apply_dir=false) {
+            // elasticity line search
+            // STEP 1. dv_i -> dv_p, gradDv_p
+            CUDA_SAFE_CALL((
+                grid_to_particle_dv_transfer_kernel<T><<<
+                (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+                (state->n_particles(), state->current_positions(), state->dvs(), state->gradDvs(),
+                 state->grid_momentum(), state->grid_Dir(), state->grid_v_star(), alpha, apply_dir)
+            ));
+
+            // STEP 2. dv_p -> dF_p -> dP_p -> dstress_p, dforce_p
+            CUDA_SAFE_CALL(cudaMemset(state->dforces(), 0, sizeof(Vec3<T>) * state->n_particles()));
+            CUDA_SAFE_CALL(cudaMemset(state->dtaus(), 0, sizeof(Mat3<T>) * state->n_particles()));
+            CUDA_SAFE_CALL((
+                calc_implicit_fem_state_and_differential_kernel<T><<<
+                (state->n_faces() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+                (state->n_faces(), state->indices(), state->index_mappings(), state->current_volumes(), state->Dm_inverses(), state->deformation_gradients(),
+                 state->dvs(), state->gradDvs(), state->dforces(), state->dtaus())
+            ));
+
+            // STEP 3. dstress_p, dforce_p -> Kdv
+            CUDA_SAFE_CALL((
+                clean_grid_Kdv_kernel<T><<<
+                (touched_cells_cnt + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+                (touched_cells_cnt, state->grid_touched_ids(), Kdv_ptr)
+            ));
+            CUDA_SAFE_CALL((
+                particle_to_grid_kernel<T, false, false><<<
+                (state->n_particles() + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
+                (state->n_particles(), state->current_positions(), nullptr, state->current_volumes(), nullptr,
+                 state->dforces(), state->dtaus(),
+                 state->current_sort_keys(),
+                 state->grid_touched_flags(), nullptr, Kdv_ptr, dt)
+            ));
+        };
 
         // line search
         const auto & line_search = [&](const T current_alpha) {
@@ -330,7 +432,7 @@
             (n_contacts,
              state->contact_pos(),
              state->contact_vel(),
-             state->current_velocities(),
+             state->contact_vel_star(),
             state->current_volumes(),
             state->contact_mpm_id(),
             state->contact_dist(),
@@ -348,13 +450,14 @@
                 global_line_search, current_alpha)
             ));
+            update_Kdv(state->grid_Kdv(), current_alpha, /*apply_dir=*/false);  // elasticity energy defined for E1
             CUDA_SAFE_CALL((
                 update_global_energy_grid_kernel<<<
                 (touched_cells_cnt + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
                 (touched_cells_cnt, state->grid_touched_ids(), state->grid_masses(),
-                 state->grid_v_star(), state->grid_momentum(), state->grid_Dir(),
+                 state->grid_v_star(), state->grid_momentum(), state->grid_Dir(), nullptr, state->grid_Kdv(), state->grid_Kdir(), nullptr,
                  global_E1_d, global_dE1_d, global_d2E1_d,
-                 /*color_mask*/ 0, current_alpha, /*eval_E0=*/ false)
+                 /*color_mask*/ 0, current_alpha, dt, /*eval_E0=*/ false)
             ));
             CUDA_SAFE_CALL(cudaDeviceSynchronize());
             T E, dE, d2E;
@@ -437,7 +540,7 @@ void GpuMpmSolver<T>::UpdateContact(GpuMpmState<T> *state, const int frame, cons
             }
 
             // Exit if f(root) is close to zero.
-            if (abs(std::get<1>(f_root)) < f_tolerance) {
+            if (abs(std::get<1>(f_root)) < f_tolerance || line_search_cnt >= max_line_search_iterations) {
                 global_line_search_satisfied = true;
             }
 
@@ -477,7 +580,7 @@
             (n_contacts,
              state->contact_pos(),
              state->contact_vel(),
-             state->current_velocities(),
+             state->contact_vel_star(),
             state->current_volumes(),
             state->contact_mpm_id(),
             state->contact_dist(),
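For reading the energy bookkeeping in these hunks: with A = M + dt^2 K, the exact line search probes E(alpha) = 1/2 ||dv* + alpha*dir||_A^2 plus the contact terms, and the grid products Kdv/Kdir are exactly what E' and E'' need. A host-side sketch of that reduction (illustrative names only; the real work happens inside update_global_energy_grid_kernel):

template <typename T>
void AccumulateEnergyTerms(int n_nodes, const T* mass, const T* dv, const T* dir,
                           const T* Kdv, const T* Kdir, T dt,
                           T* E, T* dE, T* d2E) {
    // dv is dv0 + alpha*dir at the probed step size; Kdv = K*dv, Kdir = K*dir.
    *E = *dE = *d2E = T(0);
    for (int i = 0; i < 3 * n_nodes; ++i) {
        const T m = mass[i / 3];  // one mass per grid node, three velocity components
        *E   += T(0.5) * m * dv[i] * dv[i] + T(0.5) * dt * dt * dv[i] * Kdv[i];
        *dE  += m * dir[i] * dv[i] + dt * dt * dir[i] * Kdv[i];
        *d2E += m * dir[i] * dir[i] + dt * dt * dir[i] * Kdir[i];
    }
}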
@@ -504,13 +607,20 @@ void GpuMpmSolver<T>::UpdateContact(GpuMpmState<T> *state, const int frame, cons
                 global_alpha = root;
             }
         } else {
+            if (line_search_cnt == 0) {
+                update_Kdv(state->grid_Kdv0(), 0, /*apply_dir=*/false);  // elasticity energy defined for E0
+                update_Kdv(state->grid_Kdir(), 0, /*apply_dir=*/true);   // elasticity energy defined for E0, with respect to the search direction
+            }
+            update_Kdv(state->grid_Kdv(), global_alpha, /*apply_dir=*/false);  // elasticity energy defined for E1
+
+            // STEP 4. 1/2 ||dv||^2_A = 1/2 ||dv||^2_(M + dt^2 K) = 1/2 ||dv||^2_M + 1/2 dt^2 ||dv||^2_K
             CUDA_SAFE_CALL((
                 update_global_energy_grid_kernel<<<
                 (touched_cells_cnt + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
                 (touched_cells_cnt, state->grid_touched_ids(), state->grid_masses(),
-                 state->grid_v_star(), state->grid_momentum(), state->grid_Dir(),
+                 state->grid_v_star(), state->grid_momentum(), state->grid_Dir(), state->grid_Kdv0(), state->grid_Kdv(), state->grid_Kdir(),
                  global_E0_d, global_E1_d, nullptr, nullptr,
-                 color_mask, global_alpha, /*eval_E0=*/line_search_cnt == 0)
+                 color_mask, global_alpha, dt, /*eval_E0=*/line_search_cnt == 0)
             ));
             CUDA_SAFE_CALL(cudaDeviceSynchronize());
             CUDA_SAFE_CALL(cudaMemcpy(&global_E0, global_E0_d, sizeof(T), cudaMemcpyDeviceToHost));
@@ -534,7 +644,7 @@
             (touched_cells_cnt + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
             (touched_cells_cnt, state->grid_touched_ids(), state->grid_masses(), state->grid_momentum(), state->grid_Dir(),
-             color_mask, global_alpha)
+             color_mask, global_alpha, dt)
             ));
             CUDA_SAFE_CALL(cudaDeviceSynchronize());
         }
@@ -555,10 +665,10 @@
     }
 
     CUDA_SAFE_CALL((
-        grid_to_particle_kernel<T><<<
+        grid_to_particle_kernel<T, false><<<
        (n_contacts + config::DEFAULT_CUDA_BLOCK_SIZE - 1) / config::DEFAULT_CUDA_BLOCK_SIZE, config::DEFAULT_CUDA_BLOCK_SIZE>>>
-        (n_contacts, state->contact_pos(), state->contact_vel(), nullptr,
-         state->grid_masses(), state->grid_momentum(), dt)
+        (n_contacts, state->contact_pos(), state->contact_vel(), nullptr, nullptr,
+         state->grid_masses(), state->grid_momentum(), state->grid_v_star(), dt)
         ));
     // throw;
     grid_DoFs += total_grid_DoFs;
diff --git a/multibody/gpu_mpm/cuda_mpm_solver.cuh b/multibody/gpu_mpm/cuda_mpm_solver.cuh
index 09d53d9490b6..a290009622c4 100644
--- a/multibody/gpu_mpm/cuda_mpm_solver.cuh
+++ b/multibody/gpu_mpm/cuda_mpm_solver.cuh
@@ -20,10 +20,11 @@ template <typename T>
 class GpuMpmSolver {
 public:
     void RebuildMapping(GpuMpmState<T> *state, bool sort) const;
-    void CalcFemStateAndForce(GpuMpmState<T> *state, const T& dt) const;
-    void ParticleToGrid(GpuMpmState<T> *state, const T& dt) const;
+    void CalcFemStateAndForce(GpuMpmState<T> *state, const T& dt, const bool post_contact=false) const;
+    void ParticleToGrid(GpuMpmState<T> *state, const T& dt, bool apply_gravity=true, bool apply_elasticity=true) const;
     void UpdateGrid(GpuMpmState<T> *state, int mpm_bc = -1) const;
     void GridToParticle(GpuMpmState<T> *state, const T& dt) const;
+    void ContactGridToParticle(GpuMpmState<T> *state, const T& dt) const;
     void GpuSync() const;
     void SyncParticleStateToCpu(GpuMpmState<T> *state) const;
     void Dump(const GpuMpmState<T> &state, std::string filename) const;
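Read together with the deformable_driver.h changes further below, the declarations above compose into the following per-time-step call sequence (a condensed sketch; `solver`, `state`, and the loop bounds/config values are illustrative stand-ins for the driver's members):

for (int substep = 0; substep < num_substeps; ++substep) {
    solver.SyncParticleStateToCpu(&state);
    solver.RebuildMapping(&state, false);
    solver.CalcFemStateAndForce(&state, ddt);  // pre-contact elasticity
    solver.ParticleToGrid(&state, ddt, /*apply_gravity=*/false, /*apply_elasticity=*/true);
    solver.UpdateGrid(&state, mpm_bc);
    solver.GridToParticle(&state, ddt);
}
// Gravity and contact are handled once per step under the strong coupling scheme:
solver.ParticleToGrid(&state, dt, /*apply_gravity=*/true, /*apply_elasticity=*/false);
solver.UpdateGrid(&state, mpm_bc);
// ... rebuild the contact pairs with the margin, then:
solver.UpdateContact(&state, frame, 0, dt, mu, stiffness, damping, write_files, exact_line_search);
solver.ContactGridToParticle(&state, dt);
solver.CalcFemStateAndForce(&state, dt, /*post_contact=*/true);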
diff --git a/multibody/gpu_mpm/math_tools.cuh b/multibody/gpu_mpm/math_tools.cuh
index ac8c1ebf392b..389e7e147d7c 100644
--- a/multibody/gpu_mpm/math_tools.cuh
+++ b/multibody/gpu_mpm/math_tools.cuh
@@ -509,6 +509,133 @@ inline __host__ __device__ void givens_QR(const T *A, T *Q, T *R) {
     }
 }
 
+template <typename T>
+inline __host__ __device__ void QR_differential_3x3(const T *Q, const T *R, const T *dF, T *dQ, T *dR) {
+    // TM QtdF = Q.transpose() * dF;
+    T QtdF[9], Qt[9];
+    transpose<3, 3, T>(Q, Qt);
+    matmul<3, 3, 3, T>(Qt, dF, QtdF);
+
+    // T w3 = QtdF(1, 0) / R(0, 0);
+    // T w2 = -QtdF(2, 0) / R(0, 0);
+    // T w1 = (QtdF(2, 1) + w2 * R(0, 1)) / R(1, 1);
+    const T w3 = QtdF[1 * 3 + 0] / R[0 * 3 + 0];
+    const T w2 = -QtdF[2 * 3 + 0] / R[0 * 3 + 0];
+    const T w1 = (QtdF[2 * 3 + 1] + w2 * R[0 * 3 + 1]) / R[1 * 3 + 1];
+
+    // TM QtdQ;
+    // QtdQ << 0, -w3, w2, w3, 0, -w1, -w2, w1, 0;
+    T QtdQ[9] = {
+        0, -w3, w2,
+        w3, 0, -w1,
+        -w2, w1, 0
+    };
+
+    // dQ = Q * QtdQ;
+    matmul<3, 3, 3, T>(Q, QtdQ, dQ);
+
+    // dR = Q.transpose() * dF - QtdQ * R;
+    T QtdQR[9];
+    matmul<3, 3, 3, T>(QtdQ, R, QtdQR);
+    for (int i = 0; i < 9; ++i) dR[i] = QtdF[i] - QtdQR[i];
+}
+
+template <typename T>
+inline __host__ __device__ void cofactor_matrix_2x2(const T *F, T *A) {
+    // A(0, 0) = F(1, 1);
+    A[0 * 2 + 0] = F[1 * 2 + 1];
+    // A(1, 0) = -F(0, 1);
+    A[1 * 2 + 0] = -F[0 * 2 + 1];
+    // A(0, 1) = -F(1, 0);
+    A[0 * 2 + 1] = -F[1 * 2 + 0];
+    // A(1, 1) = F(0, 0);
+    A[1 * 2 + 1] = F[0 * 2 + 0];
+}
+
+/**
+ \brief add the scaled 2x2 rotational derivative (dRdF)
+ \param[in] R Rotation matrix of F
+ \param[in] S Symmetric matrix of F
+ \param[in] scale The scale factor
+ \param[out] M The 4x4 matrix to add scale * dRdF to
+*/
+
+template <typename T>
+__device__ __host__
+void add_scaled_rotational_derivative_2x2(const T* R, const T* S, const T scale, T* M) {
+    // T trace_s = S.trace();
+    const T trace_s = S[0 * 2 + 0] + S[1 * 2 + 1];
+    const T scale_over_trace = scale / trace_s;
+    // Matrix RE;
+    // RE << -R(0, 1), R(0, 0), -R(1, 1), R(1, 0);
+    // Vector vec_RE = vec(RE);
+    const T vec_RE[4] = {
+        -R[0 * 2 + 1], -R[1 * 2 + 1], R[0 * 2 + 0], R[1 * 2 + 0]
+    };
+
+    // M.noalias() += scale_over_trace * (vec_RE * (vec_RE.transpose()));
+    #pragma unroll
+    for (int i = 0; i < 4; ++i)
+        #pragma unroll
+        for (int j = 0; j < 4; ++j)
+            M[i * 4 + j] += scale_over_trace * vec_RE[i] * vec_RE[j];
+}
+
+template <typename T>
+__device__ __host__
+void add_scaled_rotational_differential_2x2(const T* R, const T* S, const T* dF, const T scale, T* M) {
+    // T trace_s = S.trace();
+    const T trace_s = S[0 * 2 + 0] + S[1 * 2 + 1];
+
+    // Heavily exploits the 2x2 structure, but is basically derived from "rotationalDifferential":
+    // T omega = (R(0, 0) * dF(0, 1) + R(1, 0) * dF(1, 1) - R(0, 1) * dF(0, 0) - R(1, 1) * dF(1, 0)) / trace_s * scale;
+    const T omega = (R[0 * 2 + 0] * dF[0 * 2 + 1]
+                   + R[1 * 2 + 0] * dF[1 * 2 + 1]
+                   - R[0 * 2 + 1] * dF[0 * 2 + 0]
+                   - R[1 * 2 + 1] * dF[1 * 2 + 0]) / trace_s * scale;
+
+    // M(0, 0) -= omega * R(0, 1);
+    // M(1, 0) -= omega * R(1, 1);
+    // M(0, 1) += omega * R(0, 0);
+    // M(1, 1) += omega * R(1, 0);
+    M[0 * 2 + 0] -= omega * R[0 * 2 + 1];
+    M[1 * 2 + 0] -= omega * R[1 * 2 + 1];
+    M[0 * 2 + 1] += omega * R[0 * 2 + 0];
+    M[1 * 2 + 1] += omega * R[1 * 2 + 0];
+}
+
+/**
+ \brief add the scaled 2x2 derivative d( JF^(-T) )/dF
+ \param[in] F input matrix
+ \param[in] scale The scale factor
+ \param[in,out] result The 4x4 matrix that scale * d( JF^(-T) )/dF is added to
+*/
+template <typename T>
+__device__ __host__
+void add_scaled_cofactor_matrix_derivative_2x2(const T* F, const T scale, T* result) {
+    // result(3, 0) += scale;
+    result[3 * 4 + 0] += scale;
+    // result(2, 1) -= scale;
+    result[2 * 4 + 1] -= scale;
+    // result(1, 2) -= scale;
+    result[1 * 4 + 2] -= scale;
+    // result(0, 3) += scale;
+    result[0 * 4 + 3] += scale;
+}
+
+template <typename T>
+__device__ __host__
+void add_scaled_cofactor_matrix_differential_2x2(const T* F, const T* dF, const T scale, T* M) {
+    // M(0, 0) += scale * dF(1, 1);
+    M[0 * 2 + 0] += scale * dF[1 * 2 + 1];
+    // M(1, 0) -= scale * dF(0, 1);
+    M[1 * 2 + 0] -= scale * dF[0 * 2 + 1];
+    // M(0, 1) -= scale * dF(1, 0);
+    M[0 * 2 + 1] -= scale * dF[1 * 2 + 0];
+    // M(1, 1) += scale * dF(0, 0);
+    M[1 * 2 + 1] += scale * dF[0 * 2 + 0];
+}
+
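Since cofactor_matrix_2x2 is linear in F, its differential above can be checked exactly against a finite difference. A minimal host-side check, assuming math_tools.cuh is includable from a host translation unit so both __host__ __device__ helpers are visible:

#include <cstdio>

int main() {
    using T = double;
    const T F[4]  = {1.1, 0.2, -0.3, 0.9};
    const T dF[4] = {0.01, -0.02, 0.03, 0.005};
    const T eps = 1e-6;
    T Fp[4], C0[4], C1[4], M[4] = {0, 0, 0, 0};
    for (int i = 0; i < 4; ++i) Fp[i] = F[i] + eps * dF[i];
    cofactor_matrix_2x2(F, C0);    // cofactor at F
    cofactor_matrix_2x2(Fp, C1);   // cofactor at the perturbed F
    add_scaled_cofactor_matrix_differential_2x2(F, dF, T(1.), M);
    for (int i = 0; i < 4; ++i)    // the map is linear, so these agree to machine precision
        std::printf("analytic %.6e vs fd %.6e\n", M[i], (C1[i] - C0[i]) / eps);
    return 0;
}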
 template <typename T>
 inline __host__ __device__ void polar_decompose2x2(const T *A, T *U, T *P) {
     U[0] = T(1);
     U[1] = T(0);
@@ -677,6 +804,17 @@ inline __host__ __device__ bool cholesky_solve3(const T* H, const T* b, T* x) {
     return true;
 }
 
+template <int n, int m, typename T>
+__device__ __host__
+inline void watch_host(const char *name, const T *M) {
+    printf("%s\n", name);
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < m; j++) {
+            printf("%.5f ", M[i * m + j]);
+        }
+        printf("\n");
+    }
+}
 
 template <typename T>
 __device__ __host__
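Example use of the debug printer above (a hypothetical call site; the two integer template arguments are the row and column counts, reconstructed as n and m from the loop bounds):

double F[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
watch_host<3, 3, double>("F", F);  // prints the 3x3 matrix row by row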
diff --git a/multibody/plant/deformable_driver.h b/multibody/plant/deformable_driver.h
index 7a541151a325..f485ca068d65 100644
--- a/multibody/plant/deformable_driver.h
+++ b/multibody/plant/deformable_driver.h
@@ -119,7 +119,7 @@ class DeformableDriver : public ScalarConvertibleComponent<T> {
   void CalcMpmContactPairs(
       const systems::Context<T>& context,
       gmpm::GpuMpmState<gmpm::config::GpuT> *mpm_state,
-      gmpm::MpmParticleContactPairs<gmpm::config::GpuT>* result) const {
+      gmpm::MpmParticleContactPairs<gmpm::config::GpuT>* result, const gmpm::config::GpuT margin) const {
     using GpuT = gmpm::config::GpuT;
     DRAKE_ASSERT(result != nullptr);
     long long before_ts = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
@@ -141,10 +141,11 @@
       // always remember it is type GpuT and should be casted to type T explicitly.
       std::vector<geometry::SignedDistanceToPoint<T>> p_to_geometries =
           query_object.geometry_state().ComputeSignedDistanceToPoint(
-              mpm_state->positions_host()[p].template cast<T>(), T(0));
-      // identify those that are in contact, i.e. signed_distance < 0
+              mpm_state->positions_host()[p].template cast<T>(), T(margin));
+      // identify those that are in contact
+      // NOTE: register a contact point whenever the signed distance is below `margin` (penetration phi0 > -margin) instead of only on actual penetration (signed distance < 0, phi0 > 0).
       for (const auto& p2geometry : p_to_geometries) {
-        if (p2geometry.distance < 0) {
+        if (p2geometry.distance < margin) {
          // if the particle is inside the rigid body or within the margin, i.e. in contact
          // note: normal direction
          // NOTE (changyu): we treat each collision pair as an individual collision particle,
@@ -243,21 +244,30 @@
       mpm_solver_.SyncParticleStateToCpu(&mutable_mpm_state);
       mpm_solver_.RebuildMapping(&mutable_mpm_state, false);
       mpm_solver_.CalcFemStateAndForce(&mutable_mpm_state, ddt);
-      mpm_solver_.ParticleToGrid(&mutable_mpm_state, ddt);
+      mpm_solver_.ParticleToGrid(&mutable_mpm_state, ddt, false, true);
       mpm_solver_.UpdateGrid(&mutable_mpm_state, deformable_model_->cpu_mpm_model().config.mpm_bc);
-      // NOTE (changyu): update contact information at each substep for weak coupling scheme
-      CalcMpmContactPairs(context, &mutable_mpm_state, &mpm_contact_pairs);
-      mpm_solver_.CopyContactPairs(&mutable_mpm_state, mpm_contact_pairs);
-      mpm_solver_.UpdateContact(&mutable_mpm_state, current_frame, substep, ddt,
-          deformable_model_->cpu_mpm_model().config.contact_friction_mu,
-          deformable_model_->cpu_mpm_model().config.contact_stiffness,
-          deformable_model_->cpu_mpm_model().config.contact_damping,
-          deformable_model_->cpu_mpm_model().config.write_files,
-          deformable_model_->cpu_mpm_model().config.exact_line_search);
       mpm_solver_.GridToParticle(&mutable_mpm_state, ddt);
       substep += 1;
     }
+
+    mpm_solver_.SyncParticleStateToCpu(&mutable_mpm_state);
+    mpm_solver_.RebuildMapping(&mutable_mpm_state, false);
+    mpm_solver_.ParticleToGrid(&mutable_mpm_state, dt, true, false);
+    mpm_solver_.UpdateGrid(&mutable_mpm_state, deformable_model_->cpu_mpm_model().config.mpm_bc);
+
+    // NOTE (changyu): update contact information once per time step for the strong coupling scheme
+    CalcMpmContactPairs(context, &mutable_mpm_state, &mpm_contact_pairs, GpuT(deformable_model_->cpu_mpm_model().config.margin));
+    mpm_solver_.CopyContactPairs(&mutable_mpm_state, mpm_contact_pairs);
+    mpm_solver_.UpdateContact(&mutable_mpm_state, current_frame, 0, dt,
+        deformable_model_->cpu_mpm_model().config.contact_friction_mu,
+        deformable_model_->cpu_mpm_model().config.contact_stiffness,
+        deformable_model_->cpu_mpm_model().config.contact_damping,
+        deformable_model_->cpu_mpm_model().config.write_files,
+        deformable_model_->cpu_mpm_model().config.exact_line_search
+    );
+    mpm_solver_.ContactGridToParticle(&mutable_mpm_state, dt);
+    mpm_solver_.CalcFemStateAndForce(&mutable_mpm_state, dt, true);
     FinalizeExternalContactForces(&mutable_mpm_state, dt);
     // logging
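A small reading aid for the margin predicate introduced above (a hypothetical helper, not part of the patch):

// signed_distance follows Drake's convention: negative when the query point is
// inside the rigid geometry. margin == 0 recovers the old behavior (register only
// on penetration); margin > 0 registers the pair while the particle is still up to
// `margin` away, so the implicit contact solve can act before penetration occurs.
bool RegistersMpmRigidContact(double signed_distance, double margin) {
    return signed_distance < margin;
}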