Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions docs/source/feature/dynamicemb.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ DynamicEmbedding 是特征零Hash冲突Id化的一种方式,它相比设置`ha

注:目前使用DynamicEmbedding还处于实验阶段,配置和接口都可能调整,暂只支持训练和评估,暂不包含在官方提供的镜像环境中,使用前需要额外安装如下whl包

注:同一个 FeatureGroup 中若存在多个配置了 DynamicEmbedding 的特征,底层 dynamicemb 会自动将这些表融合到同一份存储里(table fusion),共享 cache/admission counter,降低显存占用并减少内存碎片,无需额外配置。

```bash
# DEVICE 可选: cu126/cu129 (支持 Python 3.10/3.11/3.12)
pip install dynamicemb-0.0.1+20260331.bea6b4b.${DEVICE} -f https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/dynamicemb/${DEVICE}/repo.html
pip install dynamicemb-0.0.1+20260407.97b80bf.${DEVICE} -f https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/dynamicemb/${DEVICE}/repo.html
```

以id_feature的配置为例,DynamicEmbedding 只需在id_feature新增一个dynamicemb的配置字段
Expand Down Expand Up @@ -34,11 +36,14 @@ feature_configs {

- **max_capacity**: 最大的id数,Id数超过后会根据Id的驱逐策略进行淘汰

- **score_strategy**: Id驱逐策略,默认为 STEP,目前支持 TIMESTAMP | STEP | LFU
- **score_strategy**: Id驱逐策略,默认为 STEP,目前支持 TIMESTAMP | STEP | LFU | NO_EVICTION

- TIMESTAMP: 每个Id根据最近更新的时间戳,驱逐时间戳最小的Id
- STEP: 每个Id根据最近更新的迭代步数,驱逐步数最早的Id
- LFU: 每个Id根据出现的频次,驱逐频次小的Id
- NO_EVICTION: 不驱逐,表容量到达 `max_capacity` 后不再接收新Id,适合配合 `init_capacity_per_rank` 做可扩容的表

- **bucket_capacity**: (可选)dynamicemb 哈希表 bucket 大小,默认为 128,增大可提升表的装填率但会增加每次查表的探测代价

- **initializer_args**: 参数初始化设置,默认是 UNIFORM

Expand Down
6 changes: 3 additions & 3 deletions requirements/extra.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260331.bea6b4b.cu129-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260331.bea6b4b.cu129-cp311-cp311-linux_x86_64.whl ; python_version=="3.11"
dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260331.bea6b4b.cu129-cp312-cp312-linux_x86_64.whl ; python_version=="3.12"
dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260407.97b80bf.cu129-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260407.97b80bf.cu129-cp311-cp311-linux_x86_64.whl ; python_version=="3.11"
dynamicemb @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/dynamicemb/cu129/dynamicemb-0.0.1%2B20260407.97b80bf.cu129-cp312-cp312-linux_x86_64.whl ; python_version=="3.12"
torch_fx_tool @ https://tzrec.oss-accelerate.aliyuncs.com/third_party/rtp/torch_fx_tool-0.0.1%2B20251201.8c109c4-py3-none-any.whl
6 changes: 5 additions & 1 deletion tzrec/protos/feature.proto
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ message DynamicEmbedding {
// the initializer args for evaluation mode. default is
// default is constant initialization with value 0.0.
optional DynamicEmbInitializerArgs eval_initializer_args = 2;
// strategy to set the score for each indices in forward and backward per table. TIMESTAMP | STEP | CUSTOMIZED | LFU
// strategy to set the score for each indices in forward and backward per table. TIMESTAMP | STEP | CUSTOMIZED | LFU | NO_EVICTION
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: The comment lists CUSTOMIZED as a valid score_strategy, but the user-facing docs intentionally omit it (it's an internal dynamicemb enum value, not meant for end users). Consider removing CUSTOMIZED from this comment to avoid confusion for anyone reading the proto directly.

Suggested change
// strategy to set the score for each indices in forward and backward per table. TIMESTAMP | STEP | CUSTOMIZED | LFU | NO_EVICTION
// strategy to set the score for each indices in forward and backward per table. TIMESTAMP | STEP | LFU | NO_EVICTION

optional string score_strategy = 4 [default = "STEP"];
// max number of embedding rows
required uint64 max_capacity = 5;
Expand All @@ -104,6 +104,10 @@ message DynamicEmbedding {
optional uint64 init_capacity_per_rank = 7;
// init table path
optional string init_table = 8;
// hash-table bucket capacity. default 128 (matches dynamicemb
// DEFAULT_BUCKET_CAPACITY). larger buckets trade probe cost for
// higher load factor.
optional uint64 bucket_capacity = 9;
oneof admission_strategy {
DynamicEmbFrequencyAdmissionStrategy frequency_admission_strategy = 100;
}
Expand Down
69 changes: 18 additions & 51 deletions tzrec/utils/dynamicemb_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from torchrec.distributed.embedding_types import (
EmbeddingComputeKernel,
GroupedEmbeddingConfig,
ShardedEmbeddingTable,
)
from torchrec.distributed.planner import (
constants,
Expand All @@ -47,7 +46,7 @@
ShardingType,
ShardMetadata,
)
from torchrec.modules.embedding_configs import BaseEmbeddingConfig, DataType
from torchrec.modules.embedding_configs import BaseEmbeddingConfig

from tzrec.protos import feature_pb2

Expand All @@ -61,7 +60,6 @@
FrequencyAdmissionStrategy,
KVCounter,
align_to_table_size,
batched_dynamicemb_compute_kernel,
)
from dynamicemb.batched_dynamicemb_compute_kernel import (
BatchedDynamicEmbedding,
Expand Down Expand Up @@ -191,6 +189,10 @@ def build_dynamicemb_constraints(
else:
raise ValueError(f"Unknown AdmissionStrategy: {admission_strategy_type}")

demb_opt_kwargs = {}
if dynamicemb_cfg.HasField("bucket_capacity"):
demb_opt_kwargs["bucket_capacity"] = dynamicemb_cfg.bucket_capacity
Comment on lines +192 to +194
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: validate bucket_capacity > 0

A user could set bucket_capacity: 0 in the proto config, which would cause a ZeroDivisionError downstream in _calculate_dynamicemb_table_storage_specific_size (the 4 / bucket_capacity expression at line 295). Consider adding a guard:

Suggested change
demb_opt_kwargs = {}
if dynamicemb_cfg.HasField("bucket_capacity"):
demb_opt_kwargs["bucket_capacity"] = dynamicemb_cfg.bucket_capacity
demb_opt_kwargs = {}
if dynamicemb_cfg.HasField("bucket_capacity"):
if dynamicemb_cfg.bucket_capacity == 0:
raise ValueError("bucket_capacity must be > 0")
demb_opt_kwargs["bucket_capacity"] = dynamicemb_cfg.bucket_capacity


dynamicemb_options = dynamicemb.DynamicEmbTableOptions(
max_capacity=dynamicemb_cfg.max_capacity,
init_capacity=init_capacity,
Expand All @@ -207,6 +209,7 @@ def build_dynamicemb_constraints(
score_strategy=score_strategy,
admit_strategy=admit_strategy,
admission_counter=admission_counter,
**demb_opt_kwargs,
)

constraints_kwargs = {}
Expand Down Expand Up @@ -352,18 +355,18 @@ def _to_sharding_plan(
bucket_capacity=dynamicemb_options.bucket_capacity,
)
)

# align to DEMB_TABLE_ALIGN_SIZE
num_aligned_embedding_per_rank = align_to_table_size(shards[0].size[0])
num_embeddings_per_shard = shards[0].size[0]
if num_aligned_embedding_per_rank < dynamicemb_options.bucket_capacity:
num_aligned_embedding_per_rank = align_to_table_size(
dynamicemb_options.bucket_capacity
)
if num_embeddings_per_shard != num_aligned_embedding_per_rank:
dynamicemb_options.num_aligned_embedding_per_rank = (
num_aligned_embedding_per_rank
)
# Fill in per-shard fields that used to be populated by
# dynamicemb's internal ``_get_dynamicemb_options_per_table``.
# After the fused-storage refactor (NVIDIA recsys-examples
# PR #343) that upstream function became a pass-through
# validator, so the caller must set ``dim``, ``max_capacity``
# (per-shard row count) and ``embedding_dtype`` directly.
dynamicemb_options.dim = shards[0].size[1]
dynamicemb_options.max_capacity = shards[0].size[0]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: should max_capacity be aligned here?

The storage estimator (_calculate_dynamicemb_table_storage_specific_size at line 288) wraps the shard size through align_to_table_size(size[0]) when computing memory budgets. Here the raw shard size is assigned directly. If the new fused-storage kernel trusts the caller's max_capacity verbatim (as the comment above suggests), this could be an under-allocation vs. the estimator's budget.

Should this be align_to_table_size(shards[0].size[0]) for consistency, or does the kernel align internally?

if dynamicemb_options.embedding_dtype is None:
dynamicemb_options.embedding_dtype = tensor.dtype
if dynamicemb_options.index_type is None:
dynamicemb_options.index_type = torch.int64

module_plan[sharding_option.name] = DynamicEmbParameterSharding(
sharding_spec=sharding_spec,
Expand Down Expand Up @@ -614,42 +617,6 @@ def dynamicemb_calculate_shard_storages(
for hbm_size, ddr_size in zip(hbm_sizes, ddr_sizes)
]

_dynamicemb_get_dynamicemb_options_per_table = (
batched_dynamicemb_compute_kernel._get_dynamicemb_options_per_table
)

def _get_dynamicemb_options_per_table(
local_row: int,
local_col: int,
data_type: DataType,
optimizer: dynamicemb.EmbOptimType,
table: ShardedEmbeddingTable,
) -> dynamicemb.DynamicEmbTableOptions:
# pyre-ignore [16]
dynamicemb_options = table.fused_params["dynamicemb_options"]
bak_local_hbm_for_values = None
if dynamicemb_options.num_aligned_embedding_per_rank is not None:
bak_local_hbm_for_values = dynamicemb_options.local_hbm_for_values

dynamicemb_options = _dynamicemb_get_dynamicemb_options_per_table(
local_row=local_row,
local_col=local_col,
data_type=data_type,
optimizer=optimizer,
table=table,
)

# do not improve the HBM budget, already aligned in planner.
if bak_local_hbm_for_values is not None:
dynamicemb_options.local_hbm_for_values = bak_local_hbm_for_values

return dynamicemb_options

# pyre-ignore [9]
batched_dynamicemb_compute_kernel._get_dynamicemb_options_per_table = (
_get_dynamicemb_options_per_table
)

# Monkey-patch for torchrec 1.5.0 compatibility
# The base class now passes 'env' parameter to _create_embedding_kernel
def _grouped_embeddings_lookup_create_embedding_kernel(
Expand Down
2 changes: 1 addition & 1 deletion tzrec/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.1.6"
__version__ = "1.1.7"
Loading