Sort by dw_last_updated before materializing #334

Draft: wants to merge 5 commits into base: main
Changes from 3 commits
9 changes: 8 additions & 1 deletion deltacat/compute/compactor_v2/steps/merge.py
@@ -41,7 +41,10 @@
interface as unimplemented_deltacat_storage,
)
from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
from deltacat.constants import BYTES_PER_GIBIBYTE
from deltacat.constants import (
BYTES_PER_GIBIBYTE,
DW_LAST_UPDATED_COLUMN_NAME,
)
from deltacat.compute.compactor_v2.constants import (
MERGE_TIME_IN_SECONDS,
MERGE_SUCCESS_COUNT,
@@ -162,6 +165,10 @@ def _merge_tables(
final_table = pa.concat_tables(result_table_list)
final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])

# TODO: Retrieve sort order policy from the table version metadata instead of hard-coding.
if DW_LAST_UPDATED_COLUMN_NAME in final_table.column_names:
final_table = final_table.sort_by([(DW_LAST_UPDATED_COLUMN_NAME, "descending")])
Comment on lines +169 to +170
Collaborator
Assuming the previously compacted table will already be sorted, we can further optimize by only sorting the incremental table.
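
Roughly, the suggested optimization might look like the sketch below: sort only the incremental rows, then do a linear merge against the previously compacted table, which is assumed to already be in descending dw_last_updated order. The function and variable names here are illustrative, not existing deltacat APIs, and null timestamps are assumed not to occur.

```python
import pyarrow as pa

DW_LAST_UPDATED_COLUMN_NAME = "dw_last_updated"  # mirrors deltacat.constants


def merge_presorted_with_incremental(
    compacted: pa.Table, incremental: pa.Table
) -> pa.Table:
    # Sort only the (typically much smaller) incremental table; the compacted
    # table is assumed to already be sorted descending on dw_last_updated.
    incremental = incremental.sort_by(
        [(DW_LAST_UPDATED_COLUMN_NAME, "descending")]
    )
    left = compacted.column(DW_LAST_UPDATED_COLUMN_NAME).to_pylist()
    right = incremental.column(DW_LAST_UPDATED_COLUMN_NAME).to_pylist()

    # Two-pointer merge of the two descending runs, producing take indices
    # into the concatenation [compacted rows..., incremental rows...].
    indices, i, j = [], 0, 0
    while i < len(left) and j < len(right):
        if left[i] >= right[j]:
            indices.append(i)
            i += 1
        else:
            indices.append(len(left) + j)
            j += 1
    indices.extend(range(i, len(left)))
    indices.extend(range(len(left) + j, len(left) + len(right)))

    return pa.concat_tables([compacted, incremental]).take(indices)
```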

Contributor Author
I think I understand. Would you mind linking a code reference?

Contributor Author (@DmitriGekhtman), Jul 16, 2024
Ah, my bad, I just looked for myself.
Yes, I see. For the purposes of a POC, perhaps you can do that in a separate PR. It would indeed be good to avoid wasting time sorting an already-sorted table.

Collaborator
I now realize we can use the sort_keys argument to sort the incremental table instead of hardcoding it here. This was okay for a POC, but we want to limit this kind of hardcoding to our internal packages.
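
For reference, a rough sketch of what declaring the ordering via sort keys instead of the hardcoded column might look like. It assumes SortKey.of accepts a sort order argument and that SortKey exposes key_name/sort_order accessors, so the exact signatures may differ from deltacat's actual storage model.

```python
from deltacat.storage import SortKey, SortOrder  # exact import path/signature assumed

# Hypothetical: declare the ordering on the table version / compaction params
# instead of referencing DW_LAST_UPDATED_COLUMN_NAME inside merge.py.
sort_keys = [
    SortKey.of(key_name="dw_last_updated", sort_order=SortOrder.DESCENDING),
]

# merge.py could then sort by whatever keys were configured, e.g.:
# final_table = final_table.sort_by(
#     [
#         (k.key_name, "descending" if k.sort_order == SortOrder.DESCENDING else "ascending")
#         for k in sort_keys
#     ]
# )
```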


return final_table


2 changes: 2 additions & 0 deletions deltacat/constants.py
@@ -53,3 +53,5 @@
PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6

MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE

DW_LAST_UPDATED_COLUMN_NAME = "dw_last_updated"
46 changes: 41 additions & 5 deletions deltacat/tests/compute/compact_partition_rebase_test_cases.py
@@ -1,4 +1,6 @@
import datetime as dt
import pyarrow as pa
from deltacat.constants import DW_LAST_UPDATED_COLUMN_NAME
from deltacat.tests.compute.test_util_common import (
PartitionKey,
PartitionKeyType,
@@ -65,20 +67,54 @@ class RebaseCompactionTestCaseParams(BaseCompactorTestCase):
],
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
),
expected_terminal_compact_partition_result=pa.Table.from_arrays(
expected_terminal_compact_partition_result=pa.Table.from_arrays([]),
expected_terminal_exception=None,
expected_terminal_exception_message=None,
do_create_placement_group=False,
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
read_kwargs_provider=None,
drop_duplicates=True,
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
),
"2-rebase-sort": RebaseCompactionTestCaseParams(
primary_keys={"pk_col_1"},
sort_keys=[
SortKey.of(key_name="sk_col_1"),
SortKey.of(key_name="sk_col_2"),
],
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
partition_values=["1"],
input_deltas=pa.Table.from_arrays(
[
pa.array([str(i) for i in range(10)]),
pa.array([i for i in range(20, 30)]),
pa.array([i for i in range(0, 10)]),
pa.array(["foo"] * 10),
pa.array([i / 10 for i in range(40, 50)]),
pa.array([i / 10 for i in range(10, 20)]),
pa.array(dt.datetime(year, 1, 1) for year in range(2000, 2010)),
],
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1", DW_LAST_UPDATED_COLUMN_NAME],
),
input_deltas_delta_type=DeltaType.UPSERT,
# dw_last_update is in ascending order in the input table.
# Expect descending sort on dw_last_updated for each hash bucket.
# Since there is only one hash bucket, the order of input rows should be reversed.
rebase_expected_compact_partition_result=pa.Table.from_arrays(
[
pa.array([str(i) for i in reversed(range(10))]),
pa.array([i for i in reversed(range(0, 10))]),
pa.array(["foo"] * 10),
pa.array([i / 10 for i in reversed(range(10, 20))]),
pa.array(dt.datetime(year, 1, 1) for year in reversed(range(2000, 2010))),
],
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1", DW_LAST_UPDATED_COLUMN_NAME],
),
expected_terminal_compact_partition_result=pa.Table.from_arrays([]),
expected_terminal_exception=None,
expected_terminal_exception_message=None,
do_create_placement_group=False,
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
Collaborator
The max records per file is 4M, but your test case doesn't have more than 4M records, so it only tests sorting within a single file and not across multiple files.

Contributor Author
Would you recommend reducing DEFAULT_MAX_RECORDS_PER_FILE to a number smaller than the row count of the input_deltas table?

Contributor Author
(At least, for the new test case?)
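
To make the ordering check span file boundaries, one option is to give the new case a records_per_compacted_file smaller than the input row count. A tiny standalone illustration of the idea (not deltacat code; the value 4 is a hypothetical override for the 10-row input):

```python
import datetime as dt
import pyarrow as pa

RECORDS_PER_FILE = 4  # hypothetical override, smaller than the 10 input rows

table = pa.table(
    {"dw_last_updated": [dt.datetime(y, 1, 1) for y in range(2000, 2010)]}
).sort_by([("dw_last_updated", "descending")])

# Split the sorted table into file-sized slices, then verify the global
# descending order still holds when the slices are read back in sequence.
slices = [
    table.slice(offset, RECORDS_PER_FILE)
    for offset in range(0, table.num_rows, RECORDS_PER_FILE)
]
assert [s.num_rows for s in slices] == [4, 4, 2]
assert pa.concat_tables(slices).equals(table)
```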

hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
hash_bucket_count=1,
read_kwargs_provider=None,
drop_duplicates=True,
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
2 changes: 2 additions & 0 deletions deltacat/tests/compute/compact_partition_test_cases.py
@@ -1,5 +1,7 @@
import datetime as dt
import pyarrow as pa
from typing import Callable, Dict, List, Optional, Set, Tuple, Union
from deltacat.constants import DW_LAST_UPDATED_COLUMN_NAME
from deltacat.tests.compute.test_util_common import (
offer_iso8601_timestamp_list,
PartitionKey,
27 changes: 17 additions & 10 deletions deltacat/tests/compute/test_compact_partition_rebase.py
@@ -5,6 +5,7 @@
import boto3
from boto3.resources.base import ServiceResource
import pyarrow as pa
from deltacat.constants import DW_LAST_UPDATED_COLUMN_NAME
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
from pytest_benchmark.fixture import BenchmarkFixture

@@ -274,16 +275,22 @@ def test_compact_partition_rebase_same_source_and_destination(
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
)
actual_rebase_compacted_table = pa.concat_tables(tables)
Collaborator
We could add an assertion here if the result is expected to yield multiple tables.

Contributor Author
If I understand correctly, the suggestion is to add an assertion on len(tables)?
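
Something along those lines, presumably; a minimal sketch of the suggested check, assuming the rebase case is expected to materialize more than one table (which depends on the records-per-file setting discussed above):

```python
# Hypothetical assertion: fail loudly if compaction did not produce multiple
# materialized tables before concatenating them for the ordering check.
assert len(tables) > 1, f"expected multiple compacted tables, got {len(tables)}"
actual_rebase_compacted_table = pa.concat_tables(tables)
```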

# if no primary key is specified then sort by sort_key for consistent assertion
sorting_cols: List[Any] = (
[(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
)
rebase_expected_compact_partition_result = (
rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
)
actual_rebase_compacted_table = (
actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
)
if DW_LAST_UPDATED_COLUMN_NAME in actual_rebase_compacted_table.column_names:
# If DW_LAST_UPDATED_COLUMN_NAME is present, don't sort expected and actual tables;
# we want to assert on the order of the rows in the table, to validate sorting on timestamp.
pass
else:
# If DW_LAST_UPDATED_COLUMN_NAME is absent, sort by primary key for consistent assertion.
# Sort by sort_key if no primary key is specified.
sorting_cols: List[Any] = (
[(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
)
rebase_expected_compact_partition_result = (
rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
)
actual_rebase_compacted_table = (
actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
)
assert actual_rebase_compacted_table.equals(
rebase_expected_compact_partition_result
), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"