diff --git a/docs/source/feature/dynamicemb.md b/docs/source/feature/dynamicemb.md index 05278721..61f5fd63 100644 --- a/docs/source/feature/dynamicemb.md +++ b/docs/source/feature/dynamicemb.md @@ -4,9 +4,11 @@ DynamicEmbedding 是特征零Hash冲突Id化的一种方式,它相比设置`ha 注:目前使用DynamicEmbedding还处于实验阶段,配置和接口都可能调整,暂只支持训练和评估,暂不包含在官方提供的镜像环境中,使用前需要额外安装如下whl包 +注:同一个 FeatureGroup 中若存在多个配置了 DynamicEmbedding 的特征,底层 dynamicemb 会自动将这些表融合到同一份存储里(table fusion),共享 cache/admission counter,降低显存占用并减少内存碎片,无需额外配置。 + ```bash # DEVICE 可选: cu126/cu129 (支持 Python 3.10/3.11/3.12) -pip install dynamicemb-0.0.1+20260331.bea6b4b.${DEVICE} -f https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/dynamicemb/${DEVICE}/repo.html +pip install dynamicemb-0.0.1+20260407.97b80bf.${DEVICE} -f https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/dynamicemb/${DEVICE}/repo.html ``` 以id_feature的配置为例,DynamicEmbedding 只需在id_feature新增一个dynamicemb的配置字段 @@ -34,11 +36,14 @@ feature_configs { - **max_capacity**: 最大的id数,Id数超过后会根据Id的驱逐策略进行淘汰 -- **score_strategy**: Id驱逐策略,默认为 STEP,目前支持 TIMESTAMP | STEP | LFU +- **score_strategy**: Id驱逐策略,默认为 STEP,目前支持 TIMESTAMP | STEP | LFU | NO_EVICTION - TIMESTAMP: 每个Id根据最近更新的时间戳,驱逐时间戳最小的Id - STEP: 每个Id根据最近更新的迭代步数,驱逐步数最早的Id - LFU: 每个Id根据出现的频次,驱逐频次小的Id + - NO_EVICTION: 不驱逐,表容量到达 `max_capacity` 后不再接收新Id,适合配合 `init_capacity_per_rank` 做可扩容的表 + +- **bucket_capacity**: (可选)dynamicemb 哈希表 bucket 大小,默认为 128,增大可提升表的装填率但会增加每次查表的探测代价 - **initializer_args**: 参数初始化设置,默认是 UNIFORM diff --git a/requirements/extra.txt b/requirements/extra.txt index 4a50b4c8..76086107 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,4 +1,4 @@ -dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260331.bea6b4b.cu129-cp310-cp310-linux_x86_64.whl ; python_version=="3.10" -dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260331.bea6b4b.cu129-cp311-cp311-linux_x86_64.whl ; python_version=="3.11" -dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260331.bea6b4b.cu129-cp312-cp312-linux_x86_64.whl ; python_version=="3.12" +dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260407.97b80bf.cu129-cp310-cp310-linux_x86_64.whl ; python_version=="3.10" +dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260407.97b80bf.cu129-cp311-cp311-linux_x86_64.whl ; python_version=="3.11" +dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260407.97b80bf.cu129-cp312-cp312-linux_x86_64.whl ; python_version=="3.12" torch_fx_tool @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/rtp/torch_fx_tool-0.0.1%2B20251201.8c109c4-py3-none-any.whl diff --git a/tzrec/protos/feature.proto b/tzrec/protos/feature.proto index c5bd99d8..29897b0d 100644 --- a/tzrec/protos/feature.proto +++ b/tzrec/protos/feature.proto @@ -94,7 +94,7 @@ message DynamicEmbedding { // the initializer args for evaluation mode. default is // default is constant initialization with value 0.0. optional DynamicEmbInitializerArgs eval_initializer_args = 2; - // strategy to set the score for each indices in forward and backward per table. TIMESTAMP | STEP | CUSTOMIZED | LFU + // strategy to set the score for each indices in forward and backward per table. TIMESTAMP | STEP | CUSTOMIZED | LFU | NO_EVICTION optional string score_strategy = 4 [default = "STEP"]; // max number of embedding rows required uint64 max_capacity = 5; @@ -104,6 +104,10 @@ message DynamicEmbedding { optional uint64 init_capacity_per_rank = 7; // init table path optional string init_table = 8; + // hash-table bucket capacity. default 128 (matches dynamicemb + // DEFAULT_BUCKET_CAPACITY). larger buckets trade probe cost for + // higher load factor. + optional uint64 bucket_capacity = 9; oneof admission_strategy { DynamicEmbFrequencyAdmissionStrategy frequency_admission_strategy = 100; } diff --git a/tzrec/utils/dynamicemb_util.py b/tzrec/utils/dynamicemb_util.py index 621a5dae..9d437f92 100644 --- a/tzrec/utils/dynamicemb_util.py +++ b/tzrec/utils/dynamicemb_util.py @@ -20,7 +20,6 @@ from torchrec.distributed.embedding_types import ( EmbeddingComputeKernel, GroupedEmbeddingConfig, - ShardedEmbeddingTable, ) from torchrec.distributed.planner import ( constants, @@ -47,7 +46,7 @@ ShardingType, ShardMetadata, ) -from torchrec.modules.embedding_configs import BaseEmbeddingConfig, DataType +from torchrec.modules.embedding_configs import BaseEmbeddingConfig from tzrec.protos import feature_pb2 @@ -61,7 +60,6 @@ FrequencyAdmissionStrategy, KVCounter, align_to_table_size, - batched_dynamicemb_compute_kernel, ) from dynamicemb.batched_dynamicemb_compute_kernel import ( BatchedDynamicEmbedding, @@ -191,6 +189,10 @@ def build_dynamicemb_constraints( else: raise ValueError(f"Unknown AdmissionStrategy: {admission_strategy_type}") + demb_opt_kwargs = {} + if dynamicemb_cfg.HasField("bucket_capacity"): + demb_opt_kwargs["bucket_capacity"] = dynamicemb_cfg.bucket_capacity + dynamicemb_options = dynamicemb.DynamicEmbTableOptions( max_capacity=dynamicemb_cfg.max_capacity, init_capacity=init_capacity, @@ -207,6 +209,7 @@ def build_dynamicemb_constraints( score_strategy=score_strategy, admit_strategy=admit_strategy, admission_counter=admission_counter, + **demb_opt_kwargs, ) constraints_kwargs = {} @@ -352,18 +355,18 @@ def _to_sharding_plan( bucket_capacity=dynamicemb_options.bucket_capacity, ) ) - - # align to DEMB_TABLE_ALIGN_SIZE - num_aligned_embedding_per_rank = align_to_table_size(shards[0].size[0]) - num_embeddings_per_shard = shards[0].size[0] - if num_aligned_embedding_per_rank < dynamicemb_options.bucket_capacity: - num_aligned_embedding_per_rank = align_to_table_size( - dynamicemb_options.bucket_capacity - ) - if num_embeddings_per_shard != num_aligned_embedding_per_rank: - dynamicemb_options.num_aligned_embedding_per_rank = ( - num_aligned_embedding_per_rank - ) + # Fill in per-shard fields that used to be populated by + # dynamicemb's internal ``_get_dynamicemb_options_per_table``. + # After the fused-storage refactor (NVIDIA recsys-examples + # PR #343) that upstream function became a pass-through + # validator, so the caller must set ``dim``, ``max_capacity`` + # (per-shard row count) and ``embedding_dtype`` directly. + dynamicemb_options.dim = shards[0].size[1] + dynamicemb_options.max_capacity = shards[0].size[0] + if dynamicemb_options.embedding_dtype is None: + dynamicemb_options.embedding_dtype = tensor.dtype + if dynamicemb_options.index_type is None: + dynamicemb_options.index_type = torch.int64 module_plan[sharding_option.name] = DynamicEmbParameterSharding( sharding_spec=sharding_spec, @@ -614,42 +617,6 @@ def dynamicemb_calculate_shard_storages( for hbm_size, ddr_size in zip(hbm_sizes, ddr_sizes) ] - _dynamicemb_get_dynamicemb_options_per_table = ( - batched_dynamicemb_compute_kernel._get_dynamicemb_options_per_table - ) - - def _get_dynamicemb_options_per_table( - local_row: int, - local_col: int, - data_type: DataType, - optimizer: dynamicemb.EmbOptimType, - table: ShardedEmbeddingTable, - ) -> dynamicemb.DynamicEmbTableOptions: - # pyre-ignore [16] - dynamicemb_options = table.fused_params["dynamicemb_options"] - bak_local_hbm_for_values = None - if dynamicemb_options.num_aligned_embedding_per_rank is not None: - bak_local_hbm_for_values = dynamicemb_options.local_hbm_for_values - - dynamicemb_options = _dynamicemb_get_dynamicemb_options_per_table( - local_row=local_row, - local_col=local_col, - data_type=data_type, - optimizer=optimizer, - table=table, - ) - - # do not improve the HBM budget, already aligned in planner. - if bak_local_hbm_for_values is not None: - dynamicemb_options.local_hbm_for_values = bak_local_hbm_for_values - - return dynamicemb_options - - # pyre-ignore [9] - batched_dynamicemb_compute_kernel._get_dynamicemb_options_per_table = ( - _get_dynamicemb_options_per_table - ) - # Monkey-patch for torchrec 1.5.0 compatibility # The base class now passes 'env' parameter to _create_embedding_kernel def _grouped_embeddings_lookup_create_embedding_kernel( diff --git a/tzrec/version.py b/tzrec/version.py index e9552152..e32c82ac 100644 --- a/tzrec/version.py +++ b/tzrec/version.py @@ -9,4 +9,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.1.6" +__version__ = "1.1.7"