# Source: TorchEasyRec/tzrec/datasets/utils.py at 2c8b93b7f1cae0c326f592fb83424afd4cc02166 (alibaba/TorchEasyRec, GitHub)
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#    http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import numpy as np
import numpy.typing as npt
import pyarrow as pa
import pyarrow.compute as pc
import torch
from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor, KeyedTensor
from torchrec.streamable import Pipelineable

from tzrec.protos import data_pb2
from tzrec.protos.data_pb2 import FieldType

# Reserved data group names.
# NOTE(review): presumably these are the keys of the per-data-group dicts held
# by `Batch` (dense_features / sparse_features / ...) — confirm with callers.
BASE_DATA_GROUP = "__BASE__"
NEG_DATA_GROUP = "__NEG__"
CROSS_NEG_DATA_GROUP = "__CNEG__"

# Reserved names for sample masks.
# NOTE(review): where these names are consumed is not visible in this file —
# confirm against the samplers / data parser.
C_SAMPLE_MASK = "__SAMPLE_MASK__"
C_NEG_SAMPLE_MASK = "__NEG_SAMPLE_MASK__"

# NOTE(review): presumably the name under which hard negative indices are
# passed around — confirm with callers.
HARD_NEG_INDICES = "hard_neg_indices"

# Checkpoint metadata column names injected into RecordBatch
CKPT_SOURCE_ID = "__ckpt_source_id__"  # string column for checkpoint source identifier
CKPT_ROW_IDX = "__ckpt_row_idx__"  # int64 column for absolute row index


def inject_checkpoint_metadata(
    batch: pa.RecordBatch,
    source_id: str,
    global_row_idx: int,
) -> Tuple[pa.RecordBatch, int]:
    """Append the checkpoint bookkeeping columns to a record batch.

    Two columns are added after the existing ones: ``CKPT_SOURCE_ID`` (the
    same ``source_id`` repeated for every row) and ``CKPT_ROW_IDX`` (consecutive
    row indices starting at ``global_row_idx``).

    Args:
        batch: The input record batch.
        source_id: The source identifier for checkpointing.
        global_row_idx: The global row index of the first row of ``batch``.

    Returns:
        A tuple ``(new_batch, next_global_row_idx)`` where
        ``next_global_row_idx`` is ``global_row_idx + len(batch)``.
    """
    num_rows = len(batch)
    next_row_idx = global_row_idx + num_rows

    source_col = pa.array([source_id] * num_rows, type=pa.string())
    row_idx_col = pa.array(
        list(range(global_row_idx, next_row_idx)), type=pa.int64()
    )

    # The batch is rebuilt from its arrays and names with the two metadata
    # columns placed last.
    columns = [*batch.columns, source_col, row_idx_col]
    column_names = [*batch.schema.names, CKPT_SOURCE_ID, CKPT_ROW_IDX]
    with_metadata = pa.RecordBatch.from_arrays(columns, names=column_names)
    return with_metadata, next_row_idx


# Mapping from the FieldType proto enum to the matching PyArrow data type.
# Covers scalars, one- and two-level lists, and maps keyed by
# string / int64 / int32.
FIELD_TYPE_TO_PA = {
    FieldType.INT32: pa.int32(),
    FieldType.INT64: pa.int64(),
    FieldType.FLOAT: pa.float32(),
    FieldType.DOUBLE: pa.float64(),
    FieldType.STRING: pa.string(),
    FieldType.ARRAY_INT32: pa.list_(pa.int32()),
    FieldType.ARRAY_INT64: pa.list_(pa.int64()),
    FieldType.ARRAY_FLOAT: pa.list_(pa.float32()),
    FieldType.ARRAY_DOUBLE: pa.list_(pa.float64()),
    FieldType.ARRAY_STRING: pa.list_(pa.string()),
    FieldType.ARRAY_ARRAY_INT32: pa.list_(pa.list_(pa.int32())),
    FieldType.ARRAY_ARRAY_INT64: pa.list_(pa.list_(pa.int64())),
    FieldType.ARRAY_ARRAY_FLOAT: pa.list_(pa.list_(pa.float32())),
    FieldType.ARRAY_ARRAY_DOUBLE: pa.list_(pa.list_(pa.float64())),
    FieldType.ARRAY_ARRAY_STRING: pa.list_(pa.list_(pa.string())),
    FieldType.MAP_STRING_INT32: pa.map_(pa.string(), pa.int32()),
    FieldType.MAP_STRING_INT64: pa.map_(pa.string(), pa.int64()),
    FieldType.MAP_STRING_FLOAT: pa.map_(pa.string(), pa.float32()),
    FieldType.MAP_STRING_DOUBLE: pa.map_(pa.string(), pa.float64()),
    FieldType.MAP_STRING_STRING: pa.map_(pa.string(), pa.string()),
    FieldType.MAP_INT64_INT32: pa.map_(pa.int64(), pa.int32()),
    FieldType.MAP_INT64_INT64: pa.map_(pa.int64(), pa.int64()),
    FieldType.MAP_INT64_FLOAT: pa.map_(pa.int64(), pa.float32()),
    FieldType.MAP_INT64_DOUBLE: pa.map_(pa.int64(), pa.float64()),
    FieldType.MAP_INT64_STRING: pa.map_(pa.int64(), pa.string()),
    FieldType.MAP_INT32_INT32: pa.map_(pa.int32(), pa.int32()),
    FieldType.MAP_INT32_INT64: pa.map_(pa.int32(), pa.int64()),
    FieldType.MAP_INT32_FLOAT: pa.map_(pa.int32(), pa.float32()),
    FieldType.MAP_INT32_DOUBLE: pa.map_(pa.int32(), pa.float64()),
    FieldType.MAP_INT32_STRING: pa.map_(pa.int32(), pa.string()),
}

# Mapping from an ODPS-style type string to the FieldType enum.
# Keys are written in the normalized form produced by `_normalize_type_str`:
# upper case, no whitespace, and with the aliases INT / BIGINT already
# rewritten to the canonical INT32 / INT64. Look-ups must therefore normalize
# the user-provided type string first.
TYPE_STR_TO_FIELD_TYPE = {
    # Basic types (use canonical names INT32/INT64)
    "INT32": FieldType.INT32,
    "INT64": FieldType.INT64,
    "STRING": FieldType.STRING,
    "FLOAT": FieldType.FLOAT,
    "DOUBLE": FieldType.DOUBLE,
    # Array types (use canonical INT32/INT64 inside)
    "ARRAY<INT32>": FieldType.ARRAY_INT32,
    "ARRAY<INT64>": FieldType.ARRAY_INT64,
    "ARRAY<STRING>": FieldType.ARRAY_STRING,
    "ARRAY<FLOAT>": FieldType.ARRAY_FLOAT,
    "ARRAY<DOUBLE>": FieldType.ARRAY_DOUBLE,
    # Nested array types
    "ARRAY<ARRAY<INT32>>": FieldType.ARRAY_ARRAY_INT32,
    "ARRAY<ARRAY<INT64>>": FieldType.ARRAY_ARRAY_INT64,
    "ARRAY<ARRAY<STRING>>": FieldType.ARRAY_ARRAY_STRING,
    "ARRAY<ARRAY<FLOAT>>": FieldType.ARRAY_ARRAY_FLOAT,
    "ARRAY<ARRAY<DOUBLE>>": FieldType.ARRAY_ARRAY_DOUBLE,
    # Map types (use canonical INT32/INT64 inside)
    "MAP<STRING,INT32>": FieldType.MAP_STRING_INT32,
    "MAP<STRING,INT64>": FieldType.MAP_STRING_INT64,
    "MAP<STRING,STRING>": FieldType.MAP_STRING_STRING,
    "MAP<STRING,FLOAT>": FieldType.MAP_STRING_FLOAT,
    "MAP<STRING,DOUBLE>": FieldType.MAP_STRING_DOUBLE,
    "MAP<INT64,INT32>": FieldType.MAP_INT64_INT32,
    "MAP<INT64,INT64>": FieldType.MAP_INT64_INT64,
    "MAP<INT64,STRING>": FieldType.MAP_INT64_STRING,
    "MAP<INT64,FLOAT>": FieldType.MAP_INT64_FLOAT,
    "MAP<INT64,DOUBLE>": FieldType.MAP_INT64_DOUBLE,
    "MAP<INT32,INT32>": FieldType.MAP_INT32_INT32,
    "MAP<INT32,INT64>": FieldType.MAP_INT32_INT64,
    "MAP<INT32,STRING>": FieldType.MAP_INT32_STRING,
    "MAP<INT32,FLOAT>": FieldType.MAP_INT32_FLOAT,
    "MAP<INT32,DOUBLE>": FieldType.MAP_INT32_DOUBLE,
}


def _normalize_type_str(type_str: str) -> str:
    """Normalize type string.

    1. Converting to uppercase
    2. Removing spaces
    3. Replacing ODPS aliases: BIGINT->INT64, INT->INT32
       (handles both BIGINT/INT64 and INT/INT32 as valid inputs)

    Args:
        type_str: type string to normalize

    Returns:
        normalized type string
    """
    normalized = type_str.upper().strip()
    normalized = re.sub(r"\s+", "", normalized)
    # Use word boundaries to match whole words only
    normalized = re.sub(r"\bBIGINT\b", "INT64", normalized)
    normalized = re.sub(r"\bINT\b", "INT32", normalized)
    return normalized


def get_input_fields_proto(
    data_config: data_pb2.DataConfig,
) -> List[data_pb2.Field]:
    """Resolve the input field definitions of a data config.

    When ``input_fields_str`` is set and non-empty it takes precedence: the
    string is parsed as ``name:type`` entries separated by ``;`` and converted
    to Field protos. Otherwise ``data_config.input_fields`` is returned as a
    list.

    Args:
        data_config: DataConfig proto message

    Returns:
        List of Field proto messages

    Raises:
        ValueError: if an entry has no ``:``, has an empty name or type, or
            names a type that is not in ``TYPE_STR_TO_FIELD_TYPE``.
    """
    has_fields_str = (
        data_config.HasField("input_fields_str") and data_config.input_fields_str
    )
    if not has_fields_str:
        # Fall back to the explicitly declared input_fields
        return list(data_config.input_fields)

    input_fields_str = data_config.input_fields_str.strip()
    if not input_fields_str:
        # Whitespace-only string: nothing to parse
        return []

    fields = []
    for raw_part in input_fields_str.split(";"):
        field_part = raw_part.strip()
        if not field_part:
            # Tolerate empty entries such as a trailing ';'
            continue

        # Split on the first ':' only; the rest belongs to the type string
        name, separator, type_str = field_part.partition(":")
        if not separator:
            raise ValueError(
                f"Invalid input_fields_str format: '{field_part}'. "
                "Expected format: 'field_name:field_type'"
            )
        name = name.strip()
        type_str = type_str.strip()

        if not name:
            raise ValueError(
                f"Empty field name in input_fields_str: '{field_part}'"
            )
        if not type_str:
            raise ValueError(
                f"Empty field type in input_fields_str: '{field_part}'"
            )

        # The lookup table is keyed by normalized type strings
        normalized_type = _normalize_type_str(type_str)
        if normalized_type not in TYPE_STR_TO_FIELD_TYPE:
            raise ValueError(
                f"Unknown field type '{type_str}' "
                f"(normalized: '{normalized_type}') for field '{name}'. "
                f"Supported types: {list(TYPE_STR_TO_FIELD_TYPE.keys())}"
            )

        fields.append(
            data_pb2.Field(
                input_name=name,
                input_type=TYPE_STR_TO_FIELD_TYPE[normalized_type],
            )
        )

    return fields


@dataclass
class ParsedData:
    """Internal parsed data structure.

    Base dataclass of the parsed-data containers defined in this module
    (sparse, dense and their sequence variants); it only carries the name
    they all share.
    """

    # NOTE(review): presumably the name of the feature/column the data belongs
    # to — confirm against the code that constructs these objects.
    name: str


@dataclass
class SparseData(ParsedData):
    """Internal data structure for sparse feature.

    Holds ``values`` and ``lengths`` arrays plus optional ``weights``.
    """

    # NOTE(review): presumably the flattened values of all samples — confirm.
    values: npt.NDArray
    # NOTE(review): presumably the number of values per sample — confirm.
    lengths: npt.NDArray
    # Optional; None when the feature carries no weights.
    # NOTE(review): presumably aligned one-to-one with `values` — confirm.
    weights: Optional[npt.NDArray] = None


@dataclass
class DenseData(ParsedData):
    """Internal data structure for dense feature.

    Holds a single ``values`` array.
    """

    # NOTE(review): shape/dtype are not fixed by this file; presumably one row
    # of values per sample — confirm against the feature parsers.
    values: npt.NDArray


@dataclass
class SequenceSparseData(ParsedData):
    """Internal data structure for sequence sparse feature.

    Holds flattened ``values`` together with two levels of lengths.
    NOTE(review): the field meanings below are inferred from the `click_seq`
    example in the `Batch` field comments of this module — confirm.
    """

    # presumably all values of all sequence steps, flattened
    values: npt.NDArray
    # presumably the number of values in each sequence step
    key_lengths: npt.NDArray
    # presumably the number of steps in each sample's sequence
    seq_lengths: npt.NDArray


@dataclass
class SequenceDenseData(ParsedData):
    """Internal data structure for sequence dense feature.

    Holds ``values`` together with the per-sequence lengths.
    """

    # NOTE(review): presumably the dense values of all sequence steps of all
    # samples, concatenated — confirm.
    values: npt.NDArray
    # NOTE(review): presumably the number of steps in each sample's sequence
    # — confirm.
    seq_lengths: npt.NDArray


class RecordBatchTensor:
    """PyArrow RecordBatch use Tensor as buffer.

    For efficient transfer data between processes, e.g., mp.Queue.

    The record batch is serialized once at construction time into a torch
    ``UntypedStorage``; `get` deserializes it again from that storage.
    """

    def __init__(self, record_batch: Optional[pa.RecordBatch] = None) -> None:
        """Serialize ``record_batch`` into a torch storage.

        Args:
            record_batch: the batch to wrap. When it is None (or falsy, see
                the note below) nothing is stored and `get` returns None.
        """
        self._schema = None
        self._buff = None
        # NOTE(review): this is a truthiness test, not an `is not None` test.
        # RecordBatch supports len(), so unless it defines its own __bool__ a
        # zero-row batch is treated like None here — confirm this is intended.
        if record_batch:
            self._schema = record_batch.schema
            self._buff = torch.UntypedStorage.from_buffer(
                record_batch.serialize(), dtype=torch.uint8
            )

    def get(self) -> Optional[pa.RecordBatch]:
        """Get RecordBatch.

        Returns:
            The deserialized record batch, or None when nothing was stored.
        """
        if self._buff is None:
            return None
        # `base` makes the Arrow buffer keep a reference to the torch storage,
        # so the memory behind the returned batch stays alive even if this
        # RecordBatchTensor is garbage collected first.
        buffer = pa.foreign_buffer(
            self._buff.data_ptr(), self._buff.size(), base=self._buff
        )
        # pyre-ignore[16]
        return pa.ipc.read_record_batch(buffer, self._schema)


def _pin_jagged_tensor(jt: JaggedTensor) -> JaggedTensor:
    """Rebuild a JaggedTensor with all of its present tensors in pinned memory.

    The optional components (weights / lengths / offsets) are read from the
    private fields, so a component that is absent (None) stays absent in the
    result.
    """
    weights = jt._weights
    lengths = jt._lengths
    offsets = jt._offsets
    return JaggedTensor(
        values=jt.values().pin_memory(),
        weights=weights.pin_memory() if weights is not None else None,
        lengths=lengths.pin_memory() if lengths is not None else None,
        offsets=offsets.pin_memory() if offsets is not None else None,
    )


@dataclass
class Batch(Pipelineable):
    """Input Batch.

    Container of all tensors of one input batch. Provides device transfer
    (`to`), stream recording (`record_stream`), memory pinning
    (`pin_memory`) and flattening to a plain tensor dict (`to_dict`).
    """

    # key of dense_features is data group name
    dense_features: Dict[str, KeyedTensor] = field(default_factory=dict)
    # key of sparse_features is data group name
    sparse_features: Dict[str, KeyedJaggedTensor] = field(default_factory=dict)
    # key of sequence_mulval_lengths is data group name
    #
    # for a multi-value sequence, we flatten it, then store the values & the
    # accumulated lengths into sparse_features, and store key_lengths &
    # seq_lengths into sequence_mulval_lengths
    #
    # e.g.
    # for the sequence `click_seq`: [[[3, 4], [5]], [[6], [7, 8]]]
    # we can denote it in jagged form with:
    #   values: [3, 4, 5, 6, 7, 8]
    #   key_lengths: [2, 1, 1, 2]
    #   seq_lengths: [2, 2]
    # then:
    #   sparse_features[dg]['click_seq'].values() = [3, 4, 5, 6, 7, 8]  # values
    #   sparse_features[dg]['click_seq'].lengths() = [3, 3]  # accumulated lengths
    #   sequence_mulval_lengths[dg]['click_seq'].values() = [2, 1, 1, 2]  # key_lengths
    #   sequence_mulval_lengths[dg]['click_seq'].lengths() = [2, 2]  # seq_lengths
    sequence_mulval_lengths: Dict[str, KeyedJaggedTensor] = field(default_factory=dict)
    # key of sequence_dense_features is feature name
    sequence_dense_features: Dict[str, JaggedTensor] = field(default_factory=dict)
    # key of labels is label name
    labels: Dict[str, torch.Tensor] = field(default_factory=dict)
    # key of jagged_labels is label name
    jagged_labels: Dict[str, JaggedTensor] = field(default_factory=dict)
    # reserved inputs [for predict]
    reserves: RecordBatchTensor = field(default_factory=RecordBatchTensor)
    # size for user side input tile when do inference and INPUT_TILE=2 or 3
    tile_size: int = field(default=-1)
    # sample_weight
    sample_weights: Dict[str, torch.Tensor] = field(default_factory=dict)

    additional_infos: Dict[str, torch.Tensor] = field(default_factory=dict)
    # dummy batch or not
    dummy: bool = field(default=False)
    # checkpoint info: {source_key: max_abs_row}
    checkpoint_info: Optional[Dict[str, int]] = field(default=None)

    def to(self, device: torch.device, non_blocking: bool = False) -> "Batch":
        """Copy to specified device.

        Every tensor-like member is moved with ``.to(device, non_blocking)``;
        ``reserves``, ``tile_size``, ``dummy`` and ``checkpoint_info`` are
        passed through unchanged.

        Args:
            device: target device.
            non_blocking: forwarded to each ``.to`` call.

        Returns:
            A new Batch holding the moved tensors.
        """
        return Batch(
            dense_features={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.dense_features.items()
            },
            sparse_features={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.sparse_features.items()
            },
            sequence_mulval_lengths={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.sequence_mulval_lengths.items()
            },
            sequence_dense_features={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.sequence_dense_features.items()
            },
            labels={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.labels.items()
            },
            jagged_labels={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.jagged_labels.items()
            },
            reserves=self.reserves,
            tile_size=self.tile_size,
            sample_weights={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.sample_weights.items()
            },
            additional_infos={
                k: v.to(device=device, non_blocking=non_blocking)
                for k, v in self.additional_infos.items()
            },
            dummy=self.dummy,
            checkpoint_info=self.checkpoint_info,
        )

    def record_stream(self, stream: torch.Stream) -> None:
        """Record which streams have used the tensor.

        Calls ``record_stream(stream)`` on every tensor-like member of the
        batch.
        """
        for v in self.dense_features.values():
            # pyre-ignore [6]
            v.record_stream(stream)
        for v in self.sparse_features.values():
            # pyre-ignore [6]
            v.record_stream(stream)
        for v in self.sequence_mulval_lengths.values():
            # pyre-ignore [6]
            v.record_stream(stream)
        for v in self.sequence_dense_features.values():
            # pyre-ignore [6]
            v.record_stream(stream)
        for v in self.labels.values():
            v.record_stream(stream)
        for v in self.jagged_labels.values():
            # pyre-ignore [6]
            v.record_stream(stream)
        for v in self.sample_weights.values():
            v.record_stream(stream)
        for v in self.additional_infos.values():
            v.record_stream(stream)

    def pin_memory(self) -> "Batch":
        """Copy to pinned memory.

        Returns:
            A new Batch whose tensors are pinned; ``reserves``, ``tile_size``,
            ``dummy`` and ``checkpoint_info`` are passed through unchanged.
        """
        # TODO(hongsheng.jhs): KeyedTensor do not have pin_memory()
        # so it is rebuilt around its pinned values tensor.
        dense_features = {}
        for k, v in self.dense_features.items():
            dense_features[k] = KeyedTensor(
                keys=v.keys(),
                length_per_key=v.length_per_key(),
                values=v.values().pin_memory(),
                key_dim=v.key_dim(),
            )
        sequence_dense_features = {
            k: _pin_jagged_tensor(v) for k, v in self.sequence_dense_features.items()
        }
        jagged_labels = {
            k: _pin_jagged_tensor(v) for k, v in self.jagged_labels.items()
        }
        return Batch(
            dense_features=dense_features,
            sparse_features={
                k: v.pin_memory() for k, v in self.sparse_features.items()
            },
            sequence_mulval_lengths={
                k: v.pin_memory() for k, v in self.sequence_mulval_lengths.items()
            },
            sequence_dense_features=sequence_dense_features,
            labels={k: v.pin_memory() for k, v in self.labels.items()},
            jagged_labels=jagged_labels,
            reserves=self.reserves,
            tile_size=self.tile_size,
            sample_weights={k: v.pin_memory() for k, v in self.sample_weights.items()},
            additional_infos={
                k: v.pin_memory() for k, v in self.additional_infos.items()
            },
            dummy=self.dummy,
            checkpoint_info=self.checkpoint_info,
        )

    def to_dict(
        self, sparse_dtype: Optional[torch.dtype] = None
    ) -> Dict[str, torch.Tensor]:
        """Convert to feature tensor dict.

        Features are flattened into ``"<name>.values"``, ``"<name>.lengths"``,
        ``"<name>.weights"`` and ``"<name>.key_lengths"`` entries; labels,
        sample weights and additional infos are stored under their own key.
        When ``tile_size`` is positive it is added as ``"batch_size"``.

        Args:
            sparse_dtype: when given, values and lengths of the sparse
                features and of sequence_mulval_lengths are cast to this
                dtype.

        Returns:
            A flat dict from entry name to tensor.
        """
        tensor_dict = {}
        for x in self.dense_features.values():
            for k, v in x.to_dict().items():
                tensor_dict[f"{k}.values"] = v
        for x in self.sparse_features.values():
            if sparse_dtype:
                x = KeyedJaggedTensor(
                    keys=x.keys(),
                    values=x.values().to(sparse_dtype),
                    lengths=x.lengths().to(sparse_dtype),
                    weights=x.weights_or_none(),
                )
            for k, v in x.to_dict().items():
                tensor_dict[f"{k}.values"] = v.values()
                tensor_dict[f"{k}.lengths"] = v.lengths()
                if v.weights_or_none() is not None:
                    tensor_dict[f"{k}.weights"] = v.weights()
        for x in self.sequence_mulval_lengths.values():
            if sparse_dtype:
                x = KeyedJaggedTensor(
                    keys=x.keys(),
                    values=x.values().to(sparse_dtype),
                    lengths=x.lengths().to(sparse_dtype),
                )
            for k, v in x.to_dict().items():
                tensor_dict[f"{k}.key_lengths"] = v.values()
                # if the same key was also emitted by sparse_features above,
                # its ".lengths" entry is replaced by these lengths
                tensor_dict[f"{k}.lengths"] = v.lengths()
        for k, v in self.sequence_dense_features.items():
            tensor_dict[f"{k}.values"] = v.values()
            tensor_dict[f"{k}.lengths"] = v.lengths()
        for k, v in self.labels.items():
            tensor_dict[f"{k}"] = v
        for k, v in self.jagged_labels.items():
            tensor_dict[f"{k}.values"] = v.values()
            tensor_dict[f"{k}.lengths"] = v.lengths()
        for k, v in self.sample_weights.items():
            tensor_dict[f"{k}"] = v
        if self.tile_size > 0:
            tensor_dict["batch_size"] = torch.tensor(self.tile_size, dtype=torch.int64)

        for k, v in self.additional_infos.items():
            tensor_dict[f"{k}"] = v

        return tensor_dict


def combine_neg_as_candidate_sequence(
    pos_data: pa.Array,
    neg_data: pa.Array,
    neg_sample_num: int,
    seq_delim: str,
) -> pa.Array:
    """Build one candidate sequence string per sample from pos/neg items.

    The flat negatives are cut into consecutive groups of ``neg_sample_num``
    items, each group is joined with ``seq_delim``, and the positive item of
    the sample is put in front of its group. Used when candidate features are
    sequence_id_features in a JAGGED_SEQUENCE group.

    Args:
        pos_data: positive item IDs, one per sample. Shape: (B,).
        neg_data: negative item IDs. Shape: (B * neg_sample_num,).
        neg_sample_num: number of negative samples per positive.
        seq_delim: delimiter for joining items into a sequence string.

    Returns:
        pa.Array of strings, each containing "pos;neg1;neg2;..." per sample.

    Example:
        pos_data = ["1", "2"]
        neg_data = ["3", "4", "5", "6"]
        neg_sample_num = 2
        seq_delim = ";"
        result = ["1;3;4", "2;5;6"]
    """
    pos_as_str = pos_data.cast(pa.string())
    neg_as_str = neg_data.cast(pa.string())

    # List offsets 0, n, 2n, ...: one list of n negatives per sample
    group_ends = np.arange(neg_sample_num, len(neg_as_str) + 1, neg_sample_num)
    list_offsets = pa.array(np.concatenate([np.array([0]), group_ends]))
    neg_groups = pa.ListArray.from_arrays(list_offsets, neg_as_str)

    joined_negs = pc.binary_join(neg_groups, seq_delim)
    return pc.binary_join_element_wise(pos_as_str, joined_negs, seq_delim)


def calc_slice_position(
    row_count: int,
    slice_id: int,
    slice_count: int,
    batch_size: int,
    drop_redundant_bs_eq_one: bool,
    pre_total_remain: int = 0,
) -> Tuple[int, int, int]:
    """Calc table read position according to the slice information.

    The ``row_count + pre_total_remain`` rows are split as evenly as possible
    over ``slice_count`` workers (the first ``split_point`` workers get one
    extra row); the share of the rows carried over from the previous table is
    then subtracted so that the returned positions refer to the current table.

    Args:
        row_count (int): table total row count.
        slice_id (int): worker id.
        slice_count (int): total worker number.
        batch_size (int): batch_size.
        drop_redundant_bs_eq_one (bool): drop last redundant batch with batch_size
            equal one to prevent train_eval hung.
        pre_total_remain (int): remaining total count in pre-table is
            insufficient to meet the batch_size requirement for each worker.

    Return:
        start (int): start row position in table.
        end (int): end row position in table (exclusive).
        total_remain (int): remaining total count in curr-table is
            insufficient to meet the batch_size requirement for each worker.
    """
    # Exact integer arithmetic: float division would lose precision for very
    # large row counts.
    pre_remain_size, pre_remain_split_point = divmod(pre_total_remain, slice_count)
    size, split_point = divmod(row_count + pre_total_remain, slice_count)

    if slice_id < split_point:
        # one of the first `split_point` workers: gets size + 1 rows
        start = slice_id * (size + 1)
        end = start + (size + 1)
    else:
        start = split_point * (size + 1) + (slice_id - split_point) * size
        end = start + size

    # Remove the rows that belong to the previous table's remainder
    real_start = (
        start - pre_remain_size * slice_id - min(pre_remain_split_point, slice_id)
    )
    real_end = (
        end
        - pre_remain_size * (slice_id + 1)
        - min(pre_remain_split_point, slice_id + 1)
    )
    # when (end - start) % bz = 1 on some workers and
    # (end - start) % bz = 0 on other workers, train_eval will hang
    if (
        drop_redundant_bs_eq_one
        and split_point != 0
        and (end - start) % batch_size == 1
        and size % batch_size == 0
    ):
        real_end = real_end - 1
        split_point = 0
    return real_start, real_end, (size % batch_size) * slice_count + split_point


def calc_remaining_intervals(
    checkpoint_state: Optional[Dict[str, int]],
    input_path: str,
    total_rows: int,
) -> List[Tuple[int, int]]:
    """Derive the not-yet-consumed row intervals from a checkpoint state.

    Checkpoint keys have the form `{input_path}:{start}`, where `start` is the
    first row of a worker's original range. Sorting the starts of all entries
    for ``input_path`` gives back the original ranges (each range ends where
    the next one starts, the last one ends at ``total_rows``); whatever lies
    after the consumed row of a range is still remaining.

    Args:
        checkpoint_state (dict): dict mapping source_id to max consumed row index.
        input_path (str): the input path to filter checkpoint entries.
        total_rows (int): total number of rows in the dataset.

    Returns:
        List of (start, end) tuples representing remaining intervals. The
        whole range ``[(0, total_rows)]`` is returned when there is no
        checkpoint entry for ``input_path``.
    """
    everything = [(0, total_rows)]
    if not checkpoint_state:
        return everything

    # Keep only the entries of this input path, as (range_start, consumed).
    # The split is on the LAST ':' because the path itself may contain colons.
    matched = []
    for key, consumed in checkpoint_state.items():
        path_part, colon, start_part = key.rpartition(":")
        if colon and path_part == input_path:
            matched.append((int(start_part), consumed))

    if not matched:
        return everything

    matched.sort(key=lambda entry: entry[0])

    # End of each range = start of the following range; the last one runs to
    # the end of the data.
    range_ends = [range_start for range_start, _ in matched[1:]]
    range_ends.append(total_rows)

    # `consumed` is the last row read, so the remainder starts at consumed + 1
    return [
        (consumed + 1, range_end)
        for (_, consumed), range_end in zip(matched, range_ends)
        if consumed + 1 < range_end
    ]


def calc_slice_intervals(
    total_rows: int,
    worker_id: int,
    num_workers: int,
    batch_size: int = 1,
    drop_redundant_bs_eq_one: bool = False,
    pre_total_remain: int = 0,
    checkpoint_state: Optional[Dict[str, int]] = None,
    input_path: Optional[str] = None,
) -> Tuple[List[Tuple[int, int]], int]:
    """Redistribute remaining intervals among workers.

    Without a checkpoint the worker simply gets its slice of
    ``[0, total_rows)``. With a checkpoint, the remaining intervals are
    flattened into one logical row range, the worker's slice of that logical
    range is computed, and the slice is mapped back onto the actual intervals.

    Args:
        total_rows (int): total number of rows in the dataset.
        worker_id: Current worker's ID (0-indexed).
        num_workers: Total number of workers.
        batch_size: batch_size.
        drop_redundant_bs_eq_one: drop last redundant batch with batch_size
            equal one to prevent train_eval hung.
        pre_total_remain (int): remaining total count in pre-table is
            insufficient to meet the batch_size requirement for each worker.
        checkpoint_state (dict): dict mapping source_id to max consumed row index.
        input_path (str): the input path to filter checkpoint entries.

    Returns:
        A tuple ``(worker_intervals, total_remain)``:
        worker_intervals (list): List of (start, end) tuples assigned to this worker.
        total_remain (int): remaining total count in curr-table is
            insufficient to meet the batch_size requirement for each worker.
    """
    intervals = None
    if checkpoint_state:
        intervals = calc_remaining_intervals(checkpoint_state, input_path, total_rows)
        # From here on only the remaining rows are distributed
        total_rows = sum(end - start for start, end in intervals)

    # Reuse calc_slice_position for worker start/end calculation
    worker_start, worker_end, total_remain = calc_slice_position(
        row_count=total_rows,
        slice_id=worker_id,
        slice_count=num_workers,
        batch_size=batch_size,
        drop_redundant_bs_eq_one=drop_redundant_bs_eq_one,
        pre_total_remain=pre_total_remain,
    )

    if intervals is None:
        # No checkpoint: the logical range is the actual range
        return [(worker_start, worker_end)], total_remain

    # Map worker's logical range [worker_start, worker_end) to actual intervals
    result = []
    logical_pos = 0
    for interval_start, interval_end in intervals:
        logical_start = logical_pos
        logical_end = logical_pos + (interval_end - interval_start)

        # Part of this interval that falls into the worker's logical range
        overlap_start = max(worker_start, logical_start)
        overlap_end = min(worker_end, logical_end)
        if overlap_start < overlap_end:
            # Map back to actual row indices
            shift = interval_start - logical_start
            result.append((overlap_start + shift, overlap_end + shift))

        logical_pos = logical_end
        if logical_pos >= worker_end:
            # Later intervals lie entirely behind the worker's range
            break

    return result, total_remain


def remove_nullable(field_type: pa.DataType) -> pa.DataType:
    """Rebuild (nested) list types without a nullable=False element field.

    List types are reconstructed recursively from their element type with
    ``pa.list_``; every other type is returned as is.

    Args:
        field_type: the data type to normalize.

    Returns:
        The normalized data type.
    """
    if not pa.types.is_list(field_type):
        return field_type

    # Mark the element field as nullable, then normalize the element type
    # itself in case it is a list as well.
    element_field = field_type.value_field.with_nullable(True)
    return pa.list_(remove_nullable(element_field.type))