Loading...
Loading...
Compare original and translation side by side
`cp_comm_type="a2a+p2p"` `hierarchical_context_parallel_sizes` `a2a+p2p` `a2a` `p2p`
`cp_comm_type="a2a+p2p"` `hierarchical_context_parallel_sizes` `a2a+p2p` `a2a` `p2p`
cfg.model.context_parallel_size = 4
cfg.model.cp_comm_type = "a2a+p2p"
cfg.model.hierarchical_context_parallel_sizes = [2, 2]
cfg.dist.use_decentralized_pg = False
`prod(hierarchical_context_parallel_sizes) == context_parallel_size`
`seq_length % (2 * context_parallel_size) == 0`
`>= 1.12.0`
cfg.model.context_parallel_size = 4
cfg.model.cp_comm_type = "a2a+p2p"
cfg.model.hierarchical_context_parallel_sizes = [2, 2]
cfg.dist.use_decentralized_pg = False
`prod(hierarchical_context_parallel_sizes) == context_parallel_size`
`seq_length % (2 * context_parallel_size) == 0`
`>= 1.12.0`
context_parallel_size: int = 1
"""Splits network input along sequence dimension across GPU ranks."""
hierarchical_context_parallel_sizes: Optional[list[int]] = None
"""Degrees of the hierarchical context parallelism. Users should provide a list to specify
the sizes for different levels. Taking the a2a+p2p cp comm type as example, it contains
groups of two levels, so the first value of the list indicates the group size of the a2a
communication type, and the second value indicates the group size of the p2p communication
type.
"""
if args.hierarchical_context_parallel_sizes:
    from numpy import prod
    assert args.context_parallel_size == prod(args.hierarchical_context_parallel_sizes)
if "a2a+p2p" in args.cp_comm_type:
    assert args.hierarchical_context_parallel_sizes is not None, \
        "--hierarchical-context-parallel-sizes must be set when a2a+p2p is used in cp comm"
parallel_state.initialize_model_parallel(
...
context_parallel_size=model_config.context_parallel_size,
hierarchical_context_parallel_sizes=model_config.hierarchical_context_parallel_sizes,
...
)
...
return ProcessGroupCollection.use_mpu_process_groups()
pg_collection = ProcessGroupCollection(
...
cp=cp_pg,
tp_cp=tp_cp_pg,
hcp=None,
ep=ep_pg,
...
)
context_parallel_size: int = 1
"""Splits network input along sequence dimension across GPU ranks."""
hierarchical_context_parallel_sizes: Optional[list[int]] = None
"""Degrees of the hierarchical context parallelism. Users should provide a list to specify
the sizes for different levels. Taking the a2a+p2p cp comm type as example, it contains
groups of two levels, so the first value of the list indicates the group size of the a2a
communication type, and the second value indicates the group size of the p2p communication
type.
"""
if args.hierarchical_context_parallel_sizes:
    from numpy import prod
    assert args.context_parallel_size == prod(args.hierarchical_context_parallel_sizes)
if "a2a+p2p" in args.cp_comm_type:
    assert args.hierarchical_context_parallel_sizes is not None, \
        "--hierarchical-context-parallel-sizes must be set when a2a+p2p is used in cp comm"
parallel_state.initialize_model_parallel(
...
context_parallel_size=model_config.context_parallel_size,
hierarchical_context_parallel_sizes=model_config.hierarchical_context_parallel_sizes,
...
)
...
return ProcessGroupCollection.use_mpu_process_groups()
pg_collection = ProcessGroupCollection(
...
cp=cp_pg,
tp_cp=tp_cp_pg,
hcp=None,
ep=ep_pg,
...
)
`hierarchical_context_parallel_sizes` `ModelParallelConfig`
undefined
`hierarchical_context_parallel_sizes` `ModelParallelConfig`
undefined
`cp_comm_type` is declared in `TransformerConfig`:
`cp_comm_type`在`TransformerConfig`中声明:
undefined
undefined
`TransformerConfig.__post_init__` `a2a+p2p`
`TransformerConfig.__post_init__` `a2a+p2p`
`parallel_state.initialize_model_parallel` `create_hierarchical_groups` `ProcessGroupCollection`
`create_hierarchical_groups` `parallel_state.initialize_model_parallel` `ProcessGroupCollection`
`TEDotProductAttention` `a2a+p2p`
`a2a+p2p` `TEDotProductAttention`
`use_decentralized_pg=True` `hierarchical_context_parallel_sizes` `a2a+p2p` `hierarchical_context_parallel_sizes` `prod(hierarchical_context_parallel_sizes)` `context_parallel_size` `HIERARCHICAL_CONTEXT_PARALLEL_GROUPS` `CONTEXT_PARALLEL_GROUP`
`use_decentralized_pg=True` `hierarchical_context_parallel_sizes` `a2a+p2p` `hierarchical_context_parallel_sizes` `hierarchical_context_parallel_sizes` `context_parallel_size` `HIERARCHICAL_CONTEXT_PARALLEL_GROUPS` `CONTEXT_PARALLEL_GROUP`
`follow_up_validation`
uv run python -m pytest tests/unit_tests/training/test_decentralized_pg.py -q
`cp_comm_type=a2a+p2p` `hierarchical_context_parallel_sizes=[2,2]`
CUDA_VISIBLE_DEVICES=0,1,2,3 uv run python -m torch.distributed.run --nproc_per_node=4 \
scripts/training/run_recipe.py \
--recipe llama32_1b_pretrain_config \
model.context_parallel_size=4 \
model.cp_comm_type=a2a+p2p \
"model.hierarchical_context_parallel_sizes=[2,2]" \
train.train_iters=2
`HIERARCHICAL_CONTEXT_PARALLEL_GROUPS` `CONTEXT_PARALLEL_GROUP`
`follow_up_validation`
uv run python -m pytest tests/unit_tests/training/test_decentralized_pg.py -q
`cp_comm_type=a2a+p2p` `hierarchical_context_parallel_sizes=[2,2]`
CUDA_VISIBLE_DEVICES=0,1,2,3 uv run python -m torch.distributed.run --nproc_per_node=4 \
scripts/training/run_recipe.py \
--recipe llama32_1b_pretrain_config \
model.context_parallel_size=4 \
model.cp_comm_type=a2a+p2p \
"model.hierarchical_context_parallel_sizes=[2,2]" \
train.train_iters=2
`HIERARCHICAL_CONTEXT_PARALLEL_GROUPS` `CONTEXT_PARALLEL_GROUP`