SkyRL
API Reference — SkyRL

Configuration

Configuration dataclasses for SkyRL training.

Top-Level Config

The root configuration object and helpers.

class SkyRLTrainConfig

SkyRLTrainConfig(data: DataConfig = DataConfig(), trainer: TrainerConfig = TrainerConfig(), generator: GeneratorConfig = GeneratorConfig(), environment: EnvironmentConfig = EnvironmentConfig()) -> None

Bases: BaseConfig

Functions:

Name — Description
from_dict_config — Construct a typed BaseConfig from a Hydra DictConfig.
from_cli_overrides — Construct a SkyRLTrainConfig from CLI arguments or a dict of overrides.

Attributes:

Source code in skyrl/train/config/config.py:694-787
@dataclass
class SkyRLTrainConfig(BaseConfig):
    """Root configuration for SkyRL training.

    Composes the four top-level config sections (data, trainer, generator,
    environment). ``__post_init__`` cross-fills defaults between the
    generator and trainer sections so each side can be configured
    independently on the CLI.
    """

    data: DataConfig = field(default_factory=DataConfig)
    trainer: TrainerConfig = field(default_factory=TrainerConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    environment: EnvironmentConfig = field(default_factory=EnvironmentConfig)

    def __post_init__(self):
        """Cross-fill defaults between the generator and trainer sections."""

        # generator.max_input_length defaults to trainer.max_prompt_length
        if self.generator.max_input_length is None:
            self.generator.max_input_length = self.trainer.max_prompt_length

        # generator rope params default to trainer rope params
        if self.generator.rope_scaling is None and self.trainer.rope_scaling is not None:
            self.generator.rope_scaling = self.trainer.rope_scaling
        if self.generator.rope_theta is None and self.trainer.rope_theta is not None:
            self.generator.rope_theta = self.trainer.rope_theta
        # Copy temperature from generator sampling params to algorithm config
        # so workers can access it without needing the generator config
        if self.trainer.algorithm.temperature is None:
            self.trainer.algorithm.temperature = self.generator.sampling_params.temperature

        if self.trainer.algorithm.max_seq_len is None:
            # NOTE (erictang000): this is the max sequence length including the prompt, since max response length
            # per batch can be variable based on the prompt length. This is used to normalize the loss for
            # seq_mean_token_sum_norm loss reduction.
            # TODO(Charlie): This calculation is not correct for multi-turn and users should use `max_seq_len` instead.
            # Should we just force users to set max_seq_len if loss reduction is seq_mean_token_sum_norm, regardless of
            # multi-turn or not?
            self.trainer.algorithm.max_seq_len = (
                self.generator.max_input_length + self.generator.sampling_params.max_generate_length
            )

    @classmethod
    def from_cli_overrides(cls, args: Union[List[str], dict]) -> "SkyRLTrainConfig":
        """Construct a SkyRLTrainConfig from CLI arguments or a dict of overrides.

        Parses CLI arguments and builds a typed config. Dataclass field defaults
        are used for any values not specified on the command line.

        Supports both new-style config paths (e.g., generator.inference_engine.backend)
        and legacy YAML-style paths (e.g., generator.backend) for backward compatibility.

        Args:
            args: Either a list of CLI arguments in 'key.path=value' format, or a dict
                  mapping dot-notation keys to values.
                  Example list: ['trainer.policy.model.path=Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed=123']
                  Example dict: {'trainer.policy.model.path': 'Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed': 123}

        Returns:
            A fully constructed SkyRLTrainConfig with CLI overrides applied.

        Raises:
            ValueError: If an argument uses the unsupported '+' prefix.
        """
        # Normalize the dict form to the canonical 'key=value' list form.
        if isinstance(args, dict):
            args = [f"{k}={v}" for k, v in args.items()]

        # Imported lazily; NOTE(review): presumably to avoid an import cycle — confirm.
        from skyrl.train.config.legacy import (
            is_legacy_config,
            translate_legacy_config,
            warn_legacy_config,
        )
        from skyrl.train.config.utils import get_legacy_config

        # Check for unsupported '+' prefix
        for arg in args:
            if arg.startswith("+"):
                raise ValueError(
                    f"The '+' prefix for adding new config fields is not supported: '{arg}'. "
                    "To add custom config fields, subclass the relevant config dataclass."
                )
        overrides = OmegaConf.from_cli(args)

        # Try new format first
        try:
            return cls.from_dict_config(overrides)
        except ValueError:
            # Fall back to legacy format: load base YAML, merge overrides, translate
            try:
                base_cfg = get_legacy_config()
                merged = OmegaConf.merge(base_cfg, overrides)
                merged_dict = OmegaConf.to_container(merged, resolve=True)

                if is_legacy_config(merged_dict):
                    warn_legacy_config()
                    translated = translate_legacy_config(merged_dict)
                    return build_nested_dataclass(cls, translated)
            except Exception:
                pass  # Legacy translation failed, re-raise original error

            # Re-raise original error if not a legacy config issue
            # (bare `raise` re-raises the ValueError from from_dict_config above)
            raise

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr data

data: DataConfig = field(default_factory=DataConfig)

attr trainer

trainer: TrainerConfig = field(default_factory=TrainerConfig)

attr generator

generator: GeneratorConfig = field(default_factory=GeneratorConfig)

attr environment

environment: EnvironmentConfig = field(default_factory=EnvironmentConfig)

method classmethod from_cli_overrides

from_cli_overrides(args: Union[List[str], dict]) -> SkyRLTrainConfig

Construct a SkyRLTrainConfig from CLI arguments or a dict of overrides.

Parses CLI arguments and builds a typed config. Dataclass field defaults are used for any values not specified on the command line.

Supports both new-style config paths (e.g., generator.inference_engine.backend) and legacy YAML-style paths (e.g., generator.backend) for backward compatibility.

Parameters:

Name: args · Type: Union[List[str], dict] · Default: required
Description: Either a list of CLI arguments in 'key.path=value' format, or a dict mapping dot-notation keys to values. Example list: ['trainer.policy.model.path=Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed=123']. Example dict: {'trainer.policy.model.path': 'Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed': 123}.

Returns:

TypeDescription
SkyRLTrainConfigA fully constructed SkyRLTrainConfig with CLI overrides applied.

Raises:

TypeDescription
ValueErrorIf an argument uses the unsupported '+' prefix.
Source code in skyrl/train/config/config.py:728-787
    @classmethod
    def from_cli_overrides(cls, args: Union[List[str], dict]) -> "SkyRLTrainConfig":
        """Build a SkyRLTrainConfig from CLI overrides.

        Accepts either a list of 'key.path=value' strings or a dict mapping
        dot-notation keys to values; dataclass field defaults fill in anything
        not overridden. Both new-style config paths
        (e.g., generator.inference_engine.backend) and legacy YAML-style paths
        (e.g., generator.backend) are accepted for backward compatibility.

        Args:
            args: CLI arguments as a list of 'key.path=value' entries, or a dict
                  of dot-notation keys to values.
                  Example list: ['trainer.policy.model.path=Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed=123']
                  Example dict: {'trainer.policy.model.path': 'Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed': 123}

        Returns:
            A fully constructed SkyRLTrainConfig with the overrides applied.

        Raises:
            ValueError: If an argument uses the unsupported '+' prefix.
        """
        # Normalize the dict form into the canonical 'key=value' list form.
        if isinstance(args, dict):
            args = [f"{k}={v}" for k, v in args.items()]

        from skyrl.train.config.legacy import (
            is_legacy_config,
            translate_legacy_config,
            warn_legacy_config,
        )
        from skyrl.train.config.utils import get_legacy_config

        # Reject Hydra's '+' append syntax up front.
        for arg in args:
            if arg.startswith("+"):
                raise ValueError(
                    f"The '+' prefix for adding new config fields is not supported: '{arg}'. "
                    "To add custom config fields, subclass the relevant config dataclass."
                )
        cli_cfg = OmegaConf.from_cli(args)

        # First attempt: interpret the overrides against the new-style schema.
        try:
            return cls.from_dict_config(cli_cfg)
        except ValueError:
            # Second attempt: treat the overrides as legacy YAML-style paths by
            # merging them onto the legacy base config and translating.
            try:
                legacy_base = get_legacy_config()
                merged_cfg = OmegaConf.merge(legacy_base, cli_cfg)
                merged_container = OmegaConf.to_container(merged_cfg, resolve=True)

                if is_legacy_config(merged_container):
                    warn_legacy_config()
                    return build_nested_dataclass(cls, translate_legacy_config(merged_container))
            except Exception:
                pass  # Legacy handling failed; surface the original error below.

            # Not a legacy config (or translation failed): re-raise the
            # original ValueError from from_dict_config.
            raise

method make_config

make_config(algorithm_cls: Optional[Type[AlgorithmConfig]] = None, trainer_cls: Optional[Type[TrainerConfig]] = None, generator_cls: Optional[Type[GeneratorConfig]] = None) -> Type[SkyRLTrainConfig]

Create a SkyRLTrainConfig subclass with custom nested config classes.

Convenience helper to avoid boilerplate when extending configs for custom algorithms or generators. For full IDE autocomplete on custom fields, use explicit subclassing instead (see examples/algorithms/dapo/main_dapo.py).

Parameters:

Name: algorithm_cls · Type: Optional[Type[AlgorithmConfig]] · Default: None
Description: Custom AlgorithmConfig subclass. If provided without trainer_cls, a TrainerConfig subclass is automatically created.
Name: trainer_cls · Type: Optional[Type[TrainerConfig]] · Default: None
Description: Custom TrainerConfig subclass. Takes precedence over algorithm_cls for the trainer config.
Name: generator_cls · Type: Optional[Type[GeneratorConfig]] · Default: None
Description: Custom GeneratorConfig subclass.

Returns:

TypeDescription
Type[SkyRLTrainConfig]A SkyRLTrainConfig subclass wired up with the custom config classes.

Example::

@dataclass
class MyAlgorithmConfig(AlgorithmConfig):
    my_param: int = 42

MyConfig = make_config(algorithm_cls=MyAlgorithmConfig)
cfg = MyConfig.from_cli_overrides(sys.argv[1:])

method get_config_as_dict

get_config_as_dict(cfg: Union[dict, BaseConfig]) -> dict

method get_config_as_yaml_str

get_config_as_yaml_str(cfg: BaseConfig) -> str

Data & Model

class DataConfig

DataConfig(train_data: List[str] = (lambda: [os.path.expanduser('~/data/gsm8k/train.parquet')])(), val_data: List[str] = (lambda: [os.path.expanduser('~/data/gsm8k/validation.parquet')])()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
train_dataList[str]
val_dataList[str]
Source code in skyrl/train/config/config.py:40-43
@dataclass
class DataConfig(BaseConfig):
    """Paths to training and validation dataset files.

    Defaults point at the GSM8K parquet files under the user's home directory.
    """

    train_data: List[str] = field(default_factory=lambda: [os.path.expanduser("~/data/gsm8k/train.parquet")])
    val_data: List[str] = field(default_factory=lambda: [os.path.expanduser("~/data/gsm8k/validation.parquet")])

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr train_data

train_data: List[str] = field(default_factory=(lambda: [os.path.expanduser('~/data/gsm8k/train.parquet')]))

attr val_data

val_data: List[str] = field(default_factory=(lambda: [os.path.expanduser('~/data/gsm8k/validation.parquet')]))

class ModelConfig

ModelConfig(path: Optional[str] = None, lora: SkyRLLoraConfig = SkyRLLoraConfig()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
pathOptional[str]
loraSkyRLLoraConfig
Source code in skyrl/train/config/config.py:65-68
@dataclass
class ModelConfig(BaseConfig):
    """Model location plus optional LoRA adapter settings."""

    # Model path; None means "unset" (e.g. the ref model path is copied from
    # the policy model in TrainerConfig.__post_init__ when left as None).
    path: Optional[str] = None
    lora: SkyRLLoraConfig = field(default_factory=SkyRLLoraConfig)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr path

path: Optional[str] = None

attr lora

lora: SkyRLLoraConfig = field(default_factory=SkyRLLoraConfig)

class SkyRLLoraConfig

SkyRLLoraConfig(rank: int = 0, alpha: int = 16, dropout: float = 0.0, lora_sync_path: str = '/tmp/skyrl_lora_sync', target_modules: str = 'all-linear', exclude_modules: Optional[str] = None, init_method: str = 'kaiming') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
rankint
alphaint
dropoutfloat
lora_sync_pathstr
target_modulesstr
exclude_modulesOptional[str]
init_methodstrFor FSDP, corresponds to init_lora_weights in PEFT.
Source code in skyrl/train/config/config.py:52-62
@dataclass
class SkyRLLoraConfig(BaseConfig):
    """LoRA adapter configuration."""

    # NOTE(review): a rank of 0 presumably disables LoRA — confirm against the trainer code.
    rank: int = 0
    alpha: int = 16
    dropout: float = 0.0
    # Filesystem path; name suggests it is used to sync LoRA weights to the
    # inference engine — TODO confirm.
    lora_sync_path: str = "/tmp/skyrl_lora_sync"
    target_modules: str = "all-linear"
    exclude_modules: Optional[str] = None
    init_method: str = "kaiming"
    """For FSDP, corresponds to ``init_lora_weights`` in PEFT.
    For Megatron, used for ``lora_A_init_method``; supports "xavier", "normal", "kaiming", "zero"."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr rank

rank: int = 0

attr alpha

alpha: int = 16

attr dropout

dropout: float = 0.0

attr lora_sync_path

lora_sync_path: str = '/tmp/skyrl_lora_sync'

attr target_modules

target_modules: str = 'all-linear'

attr exclude_modules

exclude_modules: Optional[str] = None

attr init_method

init_method: str = 'kaiming'

For FSDP, corresponds to init_lora_weights in PEFT. For Megatron, used for lora_A_init_method; supports "xavier", "normal", "kaiming", "zero".

Training

class TrainerConfig

TrainerConfig(placement: PlacementConfig = PlacementConfig(), sequence_parallel_backend: str = 'ulysses', strategy: str = 'fsdp2', policy: PolicyConfig = PolicyConfig(), ref: RefConfig = RefConfig(), critic: CriticConfig = CriticConfig(), algorithm: AlgorithmConfig = AlgorithmConfig(), fully_async: FullyAsyncConfig = FullyAsyncConfig(), gradient_checkpointing: bool = True, gradient_checkpointing_use_reentrant: bool = False, seed: int = 42, resume_mode: Optional[str] = 'latest', resume_path: Optional[str] = None, log_path: str = '/tmp/skyrl-logs', ckpt_path: str = (lambda: os.path.expanduser('~/ckpts/'))(), max_ckpts_to_keep: int = -1, ckpt_interval: int = 10, hf_save_interval: int = -1, export_path: str = (lambda: os.path.expanduser('~/exports/'))(), bf16: bool = True, epochs: int = 1, update_epochs_per_batch: int = 1, train_batch_size: int = 1024, policy_mini_batch_size: int = 256, critic_mini_batch_size: int = 256, micro_train_batch_size_per_gpu: int = 1, micro_forward_batch_size_per_gpu: int = 1, update_ref_every_epoch: bool = False, use_sample_packing: bool = True, eval_batch_size: int = 1024, eval_before_train: bool = True, eval_interval: int = 5, max_prompt_length: int = 512, flash_attn: bool = True, disable_fast_tokenizer: bool = False, project_name: str = 'skyrl', run_name: str = 'test_run', logger: str = 'wandb', dump_data_batch: bool = False, dump_eval_results: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, rope_theta: Optional[float] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
placementPlacementConfig
sequence_parallel_backendstr
strategystr
policyPolicyConfig
refRefConfig
criticCriticConfig
algorithmAlgorithmConfig
fully_asyncFullyAsyncConfig
gradient_checkpointingbool
gradient_checkpointing_use_reentrantbool
seedint
resume_modeOptional[str]None/"none", "latest", or "from_path".
resume_pathOptional[str]
log_pathstrPath for infrastructure log files. For multi-node, use a shared filesystem path to consolidate logs.
ckpt_pathstr
max_ckpts_to_keepint-1 to keep all checkpoints, N to keep only the last N.
ckpt_intervalint
hf_save_intervalintSave HuggingFace-format model every N steps. -1 to disable.
export_pathstrPath for exported artifacts (HF models, debug dumps, etc.).
bf16bool
epochsint
update_epochs_per_batchintNumber of gradient update passes over each training batch.
train_batch_sizeintSee utils/utils.py::validate_batch_sizes for train, mini, and micro batch size constraints.
policy_mini_batch_sizeint
critic_mini_batch_sizeint
micro_train_batch_size_per_gpuint
micro_forward_batch_size_per_gpuint
update_ref_every_epochbool
use_sample_packingbool
eval_batch_sizeint
eval_before_trainbool
eval_intervalint-1 to disable evaluation.
max_prompt_lengthint
flash_attnbool
disable_fast_tokenizerbool
project_namestr
run_namestr
loggerstr
dump_data_batchbool
dump_eval_resultsbool
rope_scalingOptional[Dict[str, Any]]
rope_thetaOptional[float]
Source code in skyrl/train/config/config.py:558-614
@dataclass
class TrainerConfig(BaseConfig):
    """Top-level training configuration: placement and parallelism strategy,
    model roles (policy/ref/critic), algorithm, batch sizes, checkpointing,
    evaluation, and logging.
    """

    placement: PlacementConfig = field(default_factory=PlacementConfig)
    sequence_parallel_backend: str = "ulysses"
    strategy: str = "fsdp2"
    # Model roles. The ref model path defaults to the policy model path (see __post_init__).
    policy: PolicyConfig = field(default_factory=PolicyConfig)
    ref: RefConfig = field(default_factory=RefConfig)
    critic: CriticConfig = field(default_factory=CriticConfig)
    algorithm: AlgorithmConfig = field(default_factory=AlgorithmConfig)
    fully_async: FullyAsyncConfig = field(default_factory=FullyAsyncConfig)
    gradient_checkpointing: bool = True
    gradient_checkpointing_use_reentrant: bool = False
    seed: int = 42
    resume_mode: Optional[str] = "latest"
    """``None``/``"none"``, ``"latest"``, or ``"from_path"``."""
    resume_path: Optional[str] = None
    log_path: str = "/tmp/skyrl-logs"
    """Path for infrastructure log files. For multi-node, use a shared filesystem path to consolidate logs."""
    ckpt_path: str = field(default_factory=lambda: os.path.expanduser("~/ckpts/"))
    max_ckpts_to_keep: int = -1
    """``-1`` to keep all checkpoints, ``N`` to keep only the last N."""
    ckpt_interval: int = 10
    hf_save_interval: int = -1
    """Save HuggingFace-format model every N steps. ``-1`` to disable."""
    export_path: str = field(default_factory=lambda: os.path.expanduser("~/exports/"))
    """Path for exported artifacts (HF models, debug dumps, etc.)."""
    bf16: bool = True
    epochs: int = 1
    update_epochs_per_batch: int = 1
    """Number of gradient update passes over each training batch."""
    train_batch_size: int = 1024
    """See ``utils/utils.py::validate_batch_sizes`` for train, mini, and micro batch size constraints."""
    policy_mini_batch_size: int = 256
    critic_mini_batch_size: int = 256
    micro_train_batch_size_per_gpu: int = 1
    micro_forward_batch_size_per_gpu: int = 1
    update_ref_every_epoch: bool = False
    use_sample_packing: bool = True
    eval_batch_size: int = 1024
    eval_before_train: bool = True
    eval_interval: int = 5
    """``-1`` to disable evaluation."""
    max_prompt_length: int = 512
    flash_attn: bool = True
    disable_fast_tokenizer: bool = False
    project_name: str = "skyrl"
    run_name: str = "test_run"
    logger: str = "wandb"
    dump_data_batch: bool = False
    dump_eval_results: bool = True
    # Optional RoPE overrides; when set they also seed the generator's RoPE
    # params (see SkyRLTrainConfig.__post_init__).
    rope_scaling: Optional[Dict[str, Any]] = None
    rope_theta: Optional[float] = None

    def __post_init__(self):
        """Fill in derived defaults after construction."""
        # ref model defaults to the policy model
        if self.ref.model.path is None:
            self.ref.model.path = self.policy.model.path

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr placement

placement: PlacementConfig = field(default_factory=PlacementConfig)

attr sequence_parallel_backend

sequence_parallel_backend: str = 'ulysses'

attr strategy

strategy: str = 'fsdp2'

attr policy

policy: PolicyConfig = field(default_factory=PolicyConfig)

attr ref

ref: RefConfig = field(default_factory=RefConfig)

attr critic

critic: CriticConfig = field(default_factory=CriticConfig)

attr algorithm

algorithm: AlgorithmConfig = field(default_factory=AlgorithmConfig)

attr fully_async

fully_async: FullyAsyncConfig = field(default_factory=FullyAsyncConfig)

attr gradient_checkpointing

gradient_checkpointing: bool = True

attr gradient_checkpointing_use_reentrant

gradient_checkpointing_use_reentrant: bool = False

attr seed

seed: int = 42

attr resume_mode

resume_mode: Optional[str] = 'latest'

None/"none", "latest", or "from_path".

attr resume_path

resume_path: Optional[str] = None

attr log_path

log_path: str = '/tmp/skyrl-logs'

Path for infrastructure log files. For multi-node, use a shared filesystem path to consolidate logs.

attr ckpt_path

ckpt_path: str = field(default_factory=(lambda: os.path.expanduser('~/ckpts/')))

attr max_ckpts_to_keep

max_ckpts_to_keep: int = -1

-1 to keep all checkpoints, N to keep only the last N.

attr ckpt_interval

ckpt_interval: int = 10

attr hf_save_interval

hf_save_interval: int = -1

Save HuggingFace-format model every N steps. -1 to disable.

attr export_path

export_path: str = field(default_factory=(lambda: os.path.expanduser('~/exports/')))

Path for exported artifacts (HF models, debug dumps, etc.).

attr bf16

bf16: bool = True

attr epochs

epochs: int = 1

attr update_epochs_per_batch

update_epochs_per_batch: int = 1

Number of gradient update passes over each training batch.

attr train_batch_size

train_batch_size: int = 1024

See utils/utils.py::validate_batch_sizes for train, mini, and micro batch size constraints.

attr policy_mini_batch_size

policy_mini_batch_size: int = 256

attr critic_mini_batch_size

critic_mini_batch_size: int = 256

attr micro_train_batch_size_per_gpu

micro_train_batch_size_per_gpu: int = 1

attr micro_forward_batch_size_per_gpu

micro_forward_batch_size_per_gpu: int = 1

attr update_ref_every_epoch

update_ref_every_epoch: bool = False

attr use_sample_packing

use_sample_packing: bool = True

attr eval_batch_size

eval_batch_size: int = 1024

attr eval_before_train

eval_before_train: bool = True

attr eval_interval

eval_interval: int = 5

-1 to disable evaluation.

attr max_prompt_length

max_prompt_length: int = 512

attr flash_attn

flash_attn: bool = True

attr disable_fast_tokenizer

disable_fast_tokenizer: bool = False

attr project_name

project_name: str = 'skyrl'

attr run_name

run_name: str = 'test_run'

attr logger

logger: str = 'wandb'

attr dump_data_batch

dump_data_batch: bool = False

attr dump_eval_results

dump_eval_results: bool = True

attr rope_scaling

rope_scaling: Optional[Dict[str, Any]] = None

attr rope_theta

rope_theta: Optional[float] = None

class OptimizerConfig

OptimizerConfig(lr: float = 1e-06, adam_betas: List[float] = (lambda: [0.9, 0.999])(), weight_decay: float = 0.01, max_grad_norm: float = 1.0, offload_after_step: bool = True, num_warmup_steps: int = 0, scheduler: str = 'constant_with_warmup') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
lrfloat
adam_betasList[float]
weight_decayfloat
max_grad_normfloat
offload_after_stepboolOffload optimizer state to CPU after each full training step. Only applicable when colocate_all=True.
num_warmup_stepsintNumber of mini-batch steps to warmup the optimizer.
schedulerstr
Source code in skyrl/train/config/config.py:76-86
@dataclass
class OptimizerConfig(BaseConfig):
    """Optimizer and learning-rate schedule settings."""

    lr: float = 1e-6
    adam_betas: List[float] = field(default_factory=lambda: [0.9, 0.999])
    weight_decay: float = 1e-2
    # Gradient-norm clipping threshold.
    max_grad_norm: float = 1.0
    offload_after_step: bool = True
    """Offload optimizer state to CPU after each full training step. Only applicable when ``colocate_all=True``."""
    num_warmup_steps: int = 0
    """Number of mini-batch steps to warmup the optimizer."""
    scheduler: str = "constant_with_warmup"

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr lr

lr: float = 1e-06

attr adam_betas

adam_betas: List[float] = field(default_factory=(lambda: [0.9, 0.999]))

attr weight_decay

weight_decay: float = 0.01

attr max_grad_norm

max_grad_norm: float = 1.0

attr offload_after_step

offload_after_step: bool = True

Offload optimizer state to CPU after each full training step. Only applicable when colocate_all=True.

attr num_warmup_steps

num_warmup_steps: int = 0

Number of mini-batch steps to warmup the optimizer.

attr scheduler

scheduler: str = 'constant_with_warmup'

class MixedPrecisionConfig

MixedPrecisionConfig(param_dtype: str = 'bf16', reduce_dtype: str = 'fp32', buffer_dtype: str = 'fp32') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
param_dtypestr
reduce_dtypestr
buffer_dtypestr
Source code in skyrl/train/config/config.py:89-93
@dataclass
class MixedPrecisionConfig(BaseConfig):
    """Mixed-precision dtypes (params, reductions, buffers) for the FSDP backend."""

    param_dtype: str = "bf16"
    reduce_dtype: str = "fp32"
    buffer_dtype: str = "fp32"

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr param_dtype

param_dtype: str = 'bf16'

attr reduce_dtype

reduce_dtype: str = 'fp32'

attr buffer_dtype

buffer_dtype: str = 'fp32'

Backend Config

class FSDPConfig

FSDPConfig(cpu_offload: bool = False, reshard_after_forward: Union[bool, int] = True, fsdp_size: int = -1, mixed_precision: Optional[MixedPrecisionConfig] = None, wrap_policy: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
cpu_offloadboolOffload params and optimizer state to CPU during the forward pass.
reshard_after_forwardUnion[bool, int]FSDP2 only. Accepts True, False, or an int between 1 and fsdp_size.
fsdp_sizeint
mixed_precisionOptional[MixedPrecisionConfig]
wrap_policydict
Source code in skyrl/train/config/config.py:96-105
@dataclass
class FSDPConfig(BaseConfig):
    """FSDP/FSDP2 sharding configuration."""

    cpu_offload: bool = False
    """Offload params and optimizer state to CPU during the forward pass."""
    reshard_after_forward: Union[bool, int] = True
    """FSDP2 only. Accepts True, False, or an int between 1 and ``fsdp_size``."""
    # -1 presumably means "shard across all ranks" — TODO confirm.
    fsdp_size: int = -1
    # None falls back to the backend's default precision handling.
    mixed_precision: Optional[MixedPrecisionConfig] = None
    # specify wrap policy as a dict with `transformer_layer_cls_to_wrap` key for custom module based wrapping
    wrap_policy: dict = field(default_factory=dict)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr cpu_offload

cpu_offload: bool = False

Offload params and optimizer state to CPU during the forward pass.

attr reshard_after_forward

reshard_after_forward: Union[bool, int] = True

FSDP2 only. Accepts True, False, or an int between 1 and fsdp_size.

attr fsdp_size

fsdp_size: int = -1

attr mixed_precision

mixed_precision: Optional[MixedPrecisionConfig] = None

attr wrap_policy

wrap_policy: dict = field(default_factory=dict)

class MegatronConfig

MegatronConfig(tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, context_parallel_size: int = 1, expert_model_parallel_size: int = 1, expert_tensor_parallel_size: Optional[int] = None, moe_token_dispatcher_type: str = 'alltoall', moe_router_load_balancing_type: str = 'none', moe_grouped_gemm: bool = True, moe_router_score_function: Optional[str] = None, moe_router_enable_expert_bias: Optional[bool] = None, ddp_config: MegatronDDPConfig = MegatronDDPConfig(), torch_profiler_config: MegatronTorchProfilerConfig = MegatronTorchProfilerConfig(), lora_config: MegatronLoraConfig = MegatronLoraConfig(), optimizer_config_kwargs: Dict[str, Any] = (lambda: copy.deepcopy(DEFAULT_MEGATRON_OPTIMIZER_KWARGS))(), transformer_config_kwargs: Dict[str, Any] = (lambda: copy.deepcopy(DEFAULT_TRANSFORMER_CONFIG_KWARGS))(), empty_cuda_cache: Optional[bool] = None, model_config_kwargs: dict = dict(), dist_ckpt_optim_fully_reshardable: bool = False) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:148-172
@dataclass
class MegatronConfig(BaseConfig):
    """Megatron backend parallelism and runtime configuration."""

    tensor_model_parallel_size: int = 1
    pipeline_model_parallel_size: int = 1
    context_parallel_size: int = 1
    expert_model_parallel_size: int = 1
    # None presumably means "derive from the other parallelism sizes" — TODO confirm.
    expert_tensor_parallel_size: Optional[int] = None
    # MoE runtime configuration flags
    moe_token_dispatcher_type: str = "alltoall"
    moe_router_load_balancing_type: str = "none"
    moe_grouped_gemm: bool = True
    moe_router_score_function: Optional[str] = None
    moe_router_enable_expert_bias: Optional[bool] = None
    ddp_config: MegatronDDPConfig = field(default_factory=MegatronDDPConfig)
    torch_profiler_config: MegatronTorchProfilerConfig = field(default_factory=MegatronTorchProfilerConfig)
    lora_config: MegatronLoraConfig = field(default_factory=MegatronLoraConfig)
    # deepcopy in the factory so each instance gets its own dict and per-instance
    # mutations cannot leak into the shared module-level defaults.
    optimizer_config_kwargs: Dict[str, Any] = field(
        default_factory=lambda: copy.deepcopy(DEFAULT_MEGATRON_OPTIMIZER_KWARGS)
    )
    transformer_config_kwargs: Dict[str, Any] = field(
        default_factory=lambda: copy.deepcopy(DEFAULT_TRANSFORMER_CONFIG_KWARGS)
    )
    empty_cuda_cache: Optional[bool] = None
    model_config_kwargs: dict = field(default_factory=dict)
    dist_ckpt_optim_fully_reshardable: bool = False

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr tensor_model_parallel_size

tensor_model_parallel_size: int = 1

attr pipeline_model_parallel_size

pipeline_model_parallel_size: int = 1

attr context_parallel_size

context_parallel_size: int = 1

attr expert_model_parallel_size

expert_model_parallel_size: int = 1

attr expert_tensor_parallel_size

expert_tensor_parallel_size: Optional[int] = None

attr moe_token_dispatcher_type

moe_token_dispatcher_type: str = 'alltoall'

attr moe_router_load_balancing_type

moe_router_load_balancing_type: str = 'none'

attr moe_grouped_gemm

moe_grouped_gemm: bool = True

attr moe_router_score_function

moe_router_score_function: Optional[str] = None

attr moe_router_enable_expert_bias

moe_router_enable_expert_bias: Optional[bool] = None

attr ddp_config

ddp_config: MegatronDDPConfig = field(default_factory=MegatronDDPConfig)

attr torch_profiler_config

torch_profiler_config: MegatronTorchProfilerConfig = field(default_factory=MegatronTorchProfilerConfig)

attr lora_config

lora_config: MegatronLoraConfig = field(default_factory=MegatronLoraConfig)

attr optimizer_config_kwargs

optimizer_config_kwargs: Dict[str, Any] = field(default_factory=(lambda: copy.deepcopy(DEFAULT_MEGATRON_OPTIMIZER_KWARGS)))

attr transformer_config_kwargs

transformer_config_kwargs: Dict[str, Any] = field(default_factory=(lambda: copy.deepcopy(DEFAULT_TRANSFORMER_CONFIG_KWARGS)))

attr empty_cuda_cache

empty_cuda_cache: Optional[bool] = None

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

attr dist_ckpt_optim_fully_reshardable

dist_ckpt_optim_fully_reshardable: bool = False

class MegatronDDPConfig

MegatronDDPConfig(grad_reduce_in_fp32: bool = True, overlap_grad_reduce: bool = False, overlap_param_gather: bool = False, average_in_collective: bool = True) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:113-118
@dataclass
class MegatronDDPConfig(BaseConfig):
    grad_reduce_in_fp32: bool = True
    overlap_grad_reduce: bool = False
    overlap_param_gather: bool = False
    average_in_collective: bool = True

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr grad_reduce_in_fp32

grad_reduce_in_fp32: bool = True

attr overlap_grad_reduce

overlap_grad_reduce: bool = False

attr overlap_param_gather

overlap_param_gather: bool = False

attr average_in_collective

average_in_collective: bool = True

class MegatronLoraConfig

MegatronLoraConfig(lora_type: str = 'lora') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
lora_typestr
Source code in skyrl/train/config/config.py:128-130
@dataclass
class MegatronLoraConfig(BaseConfig):
    lora_type: str = "lora"

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr lora_type

lora_type: str = 'lora'

class MegatronTorchProfilerConfig

MegatronTorchProfilerConfig(enable: bool = False, ranks: List[int] = list(), save_path: Optional[str] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
enablebool
ranksList[int]
save_pathOptional[str]
Source code in skyrl/train/config/config.py:121-125
@dataclass
class MegatronTorchProfilerConfig(BaseConfig):
    enable: bool = False
    ranks: List[int] = field(default_factory=list)
    save_path: Optional[str] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr enable

enable: bool = False

attr ranks

ranks: List[int] = field(default_factory=list)

attr save_path

save_path: Optional[str] = None

Placement

class PlacementConfig

PlacementConfig(colocate_all: bool = True, colocate_policy_ref: bool = True, policy_num_nodes: int = 1, policy_num_gpus_per_node: int = 1, critic_num_nodes: int = 1, critic_num_gpus_per_node: int = 1, ref_num_nodes: int = 1, ref_num_gpus_per_node: int = 1) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
colocate_allboolWhen True, training and inference share the same GPUs.
colocate_policy_refbool
policy_num_nodesint
policy_num_gpus_per_nodeint
critic_num_nodesint
critic_num_gpus_per_nodeint
ref_num_nodesint
ref_num_gpus_per_nodeint
Source code in skyrl/train/config/config.py:180-190
@dataclass
class PlacementConfig(BaseConfig):
    colocate_all: bool = True
    """When True, training and inference share the same GPUs."""
    colocate_policy_ref: bool = True
    policy_num_nodes: int = 1
    policy_num_gpus_per_node: int = 1
    critic_num_nodes: int = 1
    critic_num_gpus_per_node: int = 1
    ref_num_nodes: int = 1
    ref_num_gpus_per_node: int = 1

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr colocate_all

colocate_all: bool = True

When True, training and inference share the same GPUs.

attr colocate_policy_ref

colocate_policy_ref: bool = True

attr policy_num_nodes

policy_num_nodes: int = 1

attr policy_num_gpus_per_node

policy_num_gpus_per_node: int = 1

attr critic_num_nodes

critic_num_nodes: int = 1

attr critic_num_gpus_per_node

critic_num_gpus_per_node: int = 1

attr ref_num_nodes

ref_num_nodes: int = 1

attr ref_num_gpus_per_node

ref_num_gpus_per_node: int = 1

Policy & Algorithm

class PolicyConfig

PolicyConfig(model: ModelConfig = (lambda: copy.deepcopy(ModelConfig(path='Qwen/Qwen2.5-1.5B-Instruct')))(), optimizer_config: OptimizerConfig = OptimizerConfig(), fsdp_config: FSDPConfig = FSDPConfig(), sequence_parallel_size: int = 1, use_torch_compile: bool = False, record_memory: bool = False, megatron_config: MegatronConfig = MegatronConfig(), model_config_kwargs: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
modelModelConfig
optimizer_configOptimizerConfig
fsdp_configFSDPConfig
sequence_parallel_sizeint
use_torch_compileboolApply torch.compile to logits calculation.
record_memoryboolSave memory snapshots to {ckpt_path}/memory_snapshots/.
megatron_configMegatronConfig
model_config_kwargsdictPass-through kwargs for the HuggingFace model config (FSDP backends).
Source code in skyrl/train/config/config.py:198-212
@dataclass
class PolicyConfig(BaseConfig):
    model: ModelConfig = field(default_factory=lambda: copy.deepcopy(ModelConfig(path="Qwen/Qwen2.5-1.5B-Instruct")))
    optimizer_config: OptimizerConfig = field(default_factory=OptimizerConfig)
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    sequence_parallel_size: int = 1
    use_torch_compile: bool = False
    """Apply torch.compile to logits calculation."""
    record_memory: bool = False
    """Save memory snapshots to ``{ckpt_path}/memory_snapshots/``.
    Visualize by dragging pickle files to https://docs.pytorch.org/memory_viz."""
    megatron_config: MegatronConfig = field(default_factory=MegatronConfig)
    model_config_kwargs: dict = field(default_factory=dict)
    """Pass-through kwargs for the HuggingFace model config (FSDP backends).
    For Megatron, use ``policy.megatron_config.transformer_config_kwargs`` instead."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: ModelConfig = field(default_factory=(lambda: copy.deepcopy(ModelConfig(path='Qwen/Qwen2.5-1.5B-Instruct'))))

attr optimizer_config

optimizer_config: OptimizerConfig = field(default_factory=OptimizerConfig)

attr fsdp_config

fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)

attr sequence_parallel_size

sequence_parallel_size: int = 1

attr use_torch_compile

use_torch_compile: bool = False

Apply torch.compile to logits calculation.

attr record_memory

record_memory: bool = False

Save memory snapshots to {ckpt_path}/memory_snapshots/. Visualize by dragging pickle files to https://docs.pytorch.org/memory_viz.

attr megatron_config

megatron_config: MegatronConfig = field(default_factory=MegatronConfig)

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

Pass-through kwargs for the HuggingFace model config (FSDP backends). For Megatron, use policy.megatron_config.transformer_config_kwargs instead.

class CriticConfig

CriticConfig(model: ModelConfig = ModelConfig(), optimizer_config: OptimizerConfig = (lambda: OptimizerConfig(lr=5e-06))(), fsdp_config: FSDPConfig = FSDPConfig(), sequence_parallel_size: int = 1, model_config_kwargs: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:215-221
@dataclass
class CriticConfig(BaseConfig):
    model: ModelConfig = field(default_factory=ModelConfig)
    optimizer_config: OptimizerConfig = field(default_factory=lambda: OptimizerConfig(lr=5e-6))
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    sequence_parallel_size: int = 1
    model_config_kwargs: dict = field(default_factory=dict)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: ModelConfig = field(default_factory=ModelConfig)

attr optimizer_config

optimizer_config: OptimizerConfig = field(default_factory=(lambda: OptimizerConfig(lr=5e-06)))

attr fsdp_config

fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)

attr sequence_parallel_size

sequence_parallel_size: int = 1

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

class RefConfig

RefConfig(model: ModelConfig = ModelConfig(), sequence_parallel_size: int = 1, fsdp_config: FSDPConfig = FSDPConfig(), megatron_config: MegatronConfig = MegatronConfig(), model_config_kwargs: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:225-231
@dataclass
class RefConfig(BaseConfig):
    model: ModelConfig = field(default_factory=ModelConfig)
    sequence_parallel_size: int = 1
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    megatron_config: MegatronConfig = field(default_factory=MegatronConfig)
    model_config_kwargs: dict = field(default_factory=dict)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: ModelConfig = field(default_factory=ModelConfig)

attr sequence_parallel_size

sequence_parallel_size: int = 1

attr fsdp_config

fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)

attr megatron_config

megatron_config: MegatronConfig = field(default_factory=MegatronConfig)

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

class AlgorithmConfig

AlgorithmConfig(advantage_estimator: str = 'grpo', kl_ctrl: KLCtrlConfig = KLCtrlConfig(), kl_estimator_type: str = 'k3', use_kl_in_reward: bool = False, use_kl_loss: bool = True, kl_loss_coef: float = 0.001, use_entropy_loss: bool = False, entropy_loss_coef: float = 0.01, temperature: Optional[float] = None, advantage_batch_normalize: bool = False, value_head_prefix: str = 'value_head', policy_loss_type: str = 'regular', loss_reduction: str = 'token_mean', grpo_norm_by_std: bool = True, zero_variance_filter: bool = False, lambd: float = 1.0, gamma: float = 1.0, eps_clip_low: float = 0.2, eps_clip_high: float = 0.2, clip_ratio_c: float = 3.0, tis_imp_ratio_cap: float = -1.0, use_tis: bool = False, off_policy_correction: OffPolicyCorrectionConfig = OffPolicyCorrectionConfig(), sapo: SAPOConfig = SAPOConfig(), value_clip: float = 0.2, dynamic_sampling: DynamicSamplingConfig = DynamicSamplingConfig(), clip_cov: ClipCovConfig = ClipCovConfig(), kl_cov: KLCovConfig = KLCovConfig(), cispo: CISPOConfig = CISPOConfig(), max_seq_len: Optional[int] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
advantage_estimatorstr"grpo", "gae", "rloo", "reinforce++", or custom via AdvantageEstimatorRegistry.
kl_ctrlKLCtrlConfigOnly used when use_kl_in_reward=True (not applied when use_kl_loss=True).
kl_estimator_typestr"k1", "k2", "k3", "abs". See http://joschu.net/blog/kl-approx.html.
use_kl_in_rewardboolApply KL penalty to rewards. Mutually exclusive with use_kl_loss.
use_kl_lossboolApply KL loss in the policy model. Mutually exclusive with use_kl_in_reward.
kl_loss_coeffloat
use_entropy_lossbool
entropy_loss_coeffloat
temperatureOptional[float]Temperature for scaling logits in policy loss computation.
advantage_batch_normalizebool
value_head_prefixstr
policy_loss_typestr"regular", "dual_clip", "gspo", "clip_cov", "kl_cov", or custom via PolicyLossRegistry.
loss_reductionstr"token_mean", "sequence_mean", or "seq_mean_token_sum_norm".
grpo_norm_by_stdbool
zero_variance_filterboolLoss-mask prompts with zero-variance rewards. Only applicable when rewards are response-level.
lambdfloat
gammafloat
eps_clip_lowfloat
eps_clip_highfloat
clip_ratio_cfloatDual-clip parameter.
tis_imp_ratio_capfloatDeprecated: use off_policy_correction.tis_ratio_type="token" and token_tis_ratio_clip_high instead.
use_tisboolDeprecated: use off_policy_correction instead.
off_policy_correctionOffPolicyCorrectionConfig
sapoSAPOConfig
value_clipfloat
dynamic_samplingDynamicSamplingConfig
clip_covClipCovConfigOnly used when policy_loss_type="clip_cov".
kl_covKLCovConfigOnly used when policy_loss_type="kl_cov".
cispoCISPOConfigOnly used when policy_loss_type="cispo".
max_seq_lenOptional[int]Used for seq_mean_token_sum_norm loss reduction; set explicitly for multi-turn.
Source code in skyrl/train/config/config.py:322-375
@dataclass
class AlgorithmConfig(BaseConfig):
    advantage_estimator: str = "grpo"
    """``"grpo"``, ``"gae"``, ``"rloo"``, ``"reinforce++"``, or custom via ``AdvantageEstimatorRegistry``."""
    kl_ctrl: KLCtrlConfig = field(default_factory=KLCtrlConfig)
    """Only used when ``use_kl_in_reward=True`` (not applied when ``use_kl_loss=True``).
    Uses ``kl_loss_coef`` as the initial KL coefficient."""
    kl_estimator_type: str = "k3"
    """``"k1"``, ``"k2"``, ``"k3"``, ``"abs"``. See http://joschu.net/blog/kl-approx.html."""
    use_kl_in_reward: bool = False
    """Apply KL penalty to rewards. Mutually exclusive with ``use_kl_loss``."""
    use_kl_loss: bool = True
    """Apply KL loss in the policy model. Mutually exclusive with ``use_kl_in_reward``."""
    kl_loss_coef: float = 0.001
    use_entropy_loss: bool = False
    entropy_loss_coef: float = 0.01
    temperature: Optional[float] = None
    """Temperature for scaling logits in policy loss computation.
    If ``None``, will be set to the temperature provided by ``generator.sampling_params.temperature`` during config validation.
    
    NOTE: When using HTTP endpoints directly, make sure to set this value to the temperature used during generation
    """
    advantage_batch_normalize: bool = False
    value_head_prefix: str = "value_head"
    policy_loss_type: str = "regular"
    """``"regular"``, ``"dual_clip"``, ``"gspo"``, ``"clip_cov"``, ``"kl_cov"``, or custom via ``PolicyLossRegistry``."""
    loss_reduction: str = "token_mean"
    """``"token_mean"``, ``"sequence_mean"``, or ``"seq_mean_token_sum_norm"``."""
    grpo_norm_by_std: bool = True
    zero_variance_filter: bool = False
    """Loss-mask prompts with zero-variance rewards. Only applicable when rewards are response-level."""
    lambd: float = 1.0
    gamma: float = 1.0
    eps_clip_low: float = 0.2
    eps_clip_high: float = 0.2
    clip_ratio_c: float = 3.0
    """Dual-clip parameter."""
    tis_imp_ratio_cap: float = -1.0
    """Deprecated: use ``off_policy_correction.tis_ratio_type="token"`` and ``token_tis_ratio_clip_high`` instead."""
    use_tis: bool = False
    """Deprecated: use ``off_policy_correction`` instead."""
    off_policy_correction: OffPolicyCorrectionConfig = field(default_factory=OffPolicyCorrectionConfig)
    sapo: SAPOConfig = field(default_factory=SAPOConfig)
    value_clip: float = 0.2
    dynamic_sampling: DynamicSamplingConfig = field(default_factory=DynamicSamplingConfig)
    clip_cov: ClipCovConfig = field(default_factory=ClipCovConfig)
    """Only used when ``policy_loss_type="clip_cov"``."""
    kl_cov: KLCovConfig = field(default_factory=KLCovConfig)
    """Only used when ``policy_loss_type="kl_cov"``."""
    cispo: CISPOConfig = field(default_factory=CISPOConfig)
    """Only used when ``policy_loss_type="cispo"``."""
    max_seq_len: Optional[int] = None
    """Used for ``seq_mean_token_sum_norm`` loss reduction; set explicitly for multi-turn.
    If ``None``, calculated as ``generator.max_input_length + generator.sampling_params.max_generate_length``."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr advantage_estimator

advantage_estimator: str = 'grpo'

"grpo", "gae", "rloo", "reinforce++", or custom via AdvantageEstimatorRegistry.

attr kl_ctrl

kl_ctrl: KLCtrlConfig = field(default_factory=KLCtrlConfig)

Only used when use_kl_in_reward=True (not applied when use_kl_loss=True). Uses kl_loss_coef as the initial KL coefficient.

attr kl_estimator_type

kl_estimator_type: str = 'k3'

"k1", "k2", "k3", "abs". See http://joschu.net/blog/kl-approx.html.

attr use_kl_in_reward

use_kl_in_reward: bool = False

Apply KL penalty to rewards. Mutually exclusive with use_kl_loss.

attr use_kl_loss

use_kl_loss: bool = True

Apply KL loss in the policy model. Mutually exclusive with use_kl_in_reward.

attr kl_loss_coef

kl_loss_coef: float = 0.001

attr use_entropy_loss

use_entropy_loss: bool = False

attr entropy_loss_coef

entropy_loss_coef: float = 0.01

attr temperature

temperature: Optional[float] = None

Temperature for scaling logits in policy loss computation. If None, will be set to the temperature provided by generator.sampling_params.temperature during config validation.

NOTE: When using HTTP endpoints directly, make sure to set this value to the temperature used during generation

attr advantage_batch_normalize

advantage_batch_normalize: bool = False

attr value_head_prefix

value_head_prefix: str = 'value_head'

attr policy_loss_type

policy_loss_type: str = 'regular'

"regular", "dual_clip", "gspo", "clip_cov", "kl_cov", or custom via PolicyLossRegistry.

attr loss_reduction

loss_reduction: str = 'token_mean'

"token_mean", "sequence_mean", or "seq_mean_token_sum_norm".

attr grpo_norm_by_std

grpo_norm_by_std: bool = True

attr zero_variance_filter

zero_variance_filter: bool = False

Loss-mask prompts with zero-variance rewards. Only applicable when rewards are response-level.

attr lambd

lambd: float = 1.0

attr gamma

gamma: float = 1.0

attr eps_clip_low

eps_clip_low: float = 0.2

attr eps_clip_high

eps_clip_high: float = 0.2

attr clip_ratio_c

clip_ratio_c: float = 3.0

Dual-clip parameter.

attr tis_imp_ratio_cap

tis_imp_ratio_cap: float = -1.0

Deprecated: use off_policy_correction.tis_ratio_type="token" and token_tis_ratio_clip_high instead.

attr use_tis

use_tis: bool = False

Deprecated: use off_policy_correction instead.

attr off_policy_correction

off_policy_correction: OffPolicyCorrectionConfig = field(default_factory=OffPolicyCorrectionConfig)

attr sapo

sapo: SAPOConfig = field(default_factory=SAPOConfig)

attr value_clip

value_clip: float = 0.2

attr dynamic_sampling

dynamic_sampling: DynamicSamplingConfig = field(default_factory=DynamicSamplingConfig)

attr clip_cov

clip_cov: ClipCovConfig = field(default_factory=ClipCovConfig)

Only used when policy_loss_type="clip_cov".

attr kl_cov

kl_cov: KLCovConfig = field(default_factory=KLCovConfig)

Only used when policy_loss_type="kl_cov".

attr cispo

cispo: CISPOConfig = field(default_factory=CISPOConfig)

Only used when policy_loss_type="cispo".

attr max_seq_len

max_seq_len: Optional[int] = None

Used for seq_mean_token_sum_norm loss reduction; set explicitly for multi-turn. If None, calculated as generator.max_input_length + generator.sampling_params.max_generate_length.

class KLCtrlConfig

KLCtrlConfig(type: str = 'fixed', kl_target: float = 0.1, horizon: int = 10000) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
typestr"fixed" or "adaptive".
kl_targetfloatTarget KL divergence for the adaptive KL controller.
horizonintControls the update rate of the adaptive KL controller.
Source code in skyrl/train/config/config.py:239-247
@dataclass
class KLCtrlConfig(BaseConfig):

    type: str = "fixed"
    """``"fixed"`` or ``"adaptive"``."""
    kl_target: float = 0.1
    """Target KL divergence for the adaptive KL controller."""
    horizon: int = 10000
    """Controls the update rate of the adaptive KL controller."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr type

type: str = 'fixed'

"fixed" or "adaptive".

attr kl_target

kl_target: float = 0.1

Target KL divergence for the adaptive KL controller.

attr horizon

horizon: int = 10000

Controls the update rate of the adaptive KL controller.

Algorithm Extensions

class SAPOConfig

SAPOConfig(tau_pos: float = 1.0, tau_neg: float = 1.05) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
tau_posfloat
tau_negfloat
Source code in skyrl/train/config/config.py:250-253
@dataclass
class SAPOConfig(BaseConfig):
    tau_pos: float = 1.0
    tau_neg: float = 1.05

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr tau_pos

tau_pos: float = 1.0

attr tau_neg

tau_neg: float = 1.05

class DynamicSamplingConfig

DynamicSamplingConfig(type: Optional[str] = None, max_sample_batches: int = 30, min_replace_ratio: float = 0.3) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
typeOptional[str]"filter", "replace", or None.
max_sample_batchesintSample at most this many batches before stopping. -1 to sample forever.
min_replace_ratiofloatMinimum proportion of good samples to replace bad samples. Only used with "replace" strategy.
Source code in skyrl/train/config/config.py:256-263
@dataclass
class DynamicSamplingConfig(BaseConfig):
    type: Optional[str] = None
    """``"filter"``, ``"replace"``, or ``None``."""
    max_sample_batches: int = 30
    """Sample at most this many batches before stopping. ``-1`` to sample forever."""
    min_replace_ratio: float = 0.3
    """Minimum proportion of good samples to replace bad samples. Only used with ``"replace"`` strategy."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr type

type: Optional[str] = None

"filter", "replace", or None.

attr max_sample_batches

max_sample_batches: int = 30

Sample at most this many batches before stopping. -1 to sample forever.

attr min_replace_ratio

min_replace_ratio: float = 0.3

Minimum proportion of good samples to replace bad samples. Only used with "replace" strategy.

class ClipCovConfig

ClipCovConfig(clip_ratio: float = 0.0002, clip_cov_lb: float = 1.0, clip_cov_ub: float = 5.0) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
clip_ratiofloatFraction of tokens to clip based on covariance.
clip_cov_lbfloat
clip_cov_ubfloat
Source code in skyrl/train/config/config.py:266-272
@dataclass
class ClipCovConfig(BaseConfig):

    clip_ratio: float = 0.0002
    """Fraction of tokens to clip based on covariance."""
    clip_cov_lb: float = 1.0
    clip_cov_ub: float = 5.0

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr clip_ratio

clip_ratio: float = 0.0002

Fraction of tokens to clip based on covariance.

attr clip_cov_lb

clip_cov_lb: float = 1.0

attr clip_cov_ub

clip_cov_ub: float = 5.0

class KLCovConfig

KLCovConfig(kl_cov_frac: float = 0.2, ppo_kl_coef: float = 1.0) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
kl_cov_fracfloatFraction of tokens to apply KL regularization to.
ppo_kl_coeffloat
Source code in skyrl/train/config/config.py:275-280
@dataclass
class KLCovConfig(BaseConfig):

    kl_cov_frac: float = 0.2
    """Fraction of tokens to apply KL regularization to."""
    ppo_kl_coef: float = 1.0

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr kl_cov_frac

kl_cov_frac: float = 0.2

Fraction of tokens to apply KL regularization to.

attr ppo_kl_coef

ppo_kl_coef: float = 1.0

class CISPOConfig

CISPOConfig(cispo_eps_clip_low: float = 0.0, cispo_eps_clip_high: float = 5.0) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
cispo_eps_clip_lowfloatOffset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping).
cispo_eps_clip_highfloatOffset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping).
Source code in skyrl/train/config/config.py:283-289
@dataclass
class CISPOConfig(BaseConfig):

    cispo_eps_clip_low: float = 0.0
    """Offset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping)."""
    cispo_eps_clip_high: float = 5.0
    """Offset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping)."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr cispo_eps_clip_low

cispo_eps_clip_low: float = 0.0

Offset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping).

attr cispo_eps_clip_high

cispo_eps_clip_high: float = 5.0

Offset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping).

class OffPolicyCorrectionConfig

OffPolicyCorrectionConfig(tis_ratio_type: Optional[str] = None, token_tis_ratio_clip_high: float = 2.0, sequence_tis_ratio_clip_high: float = 5.0, sequence_mask_metric: Optional[str] = None, geo_mask_high: float = 1.01, geo_mask_low: float = 0.99, product_mask_high: float = 2.0, product_mask_low: float = 0.5, outlier_token_is_threshold_low: Optional[float] = None, outlier_token_is_threshold_high: Optional[float] = None, token_mask_is_threshold_low: Optional[float] = None, token_mask_is_threshold_high: Optional[float] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
tis_ratio_typeOptional[str]Importance sampling ratio type for PPO loss correction: None, "token", or "sequence".
token_tis_ratio_clip_highfloatUsed when tis_ratio_type="token". Recommended range: 1.5--5.0.
sequence_tis_ratio_clip_highfloatUsed when tis_ratio_type="sequence". Recommended range: 2.0--10.0.
sequence_mask_metricOptional[str]Method for masking sequences with cumulative IS ratios outside cap: None, "product", or "geometric".
geo_mask_highfloatUsed when sequence_mask_metric="geometric". Recommended ~0.99--1.01; MoE models may need a wider range.
geo_mask_lowfloatUsed when sequence_mask_metric="geometric".
product_mask_highfloatUsed when sequence_mask_metric="product". Recommended ~0.5--2.0.
product_mask_lowfloatUsed when sequence_mask_metric="product".
outlier_token_is_threshold_lowOptional[float]Set to mask sequences with any token IS ratio below this threshold. Suggested: 1e-4. None to disable.
outlier_token_is_threshold_highOptional[float]Set to mask sequences with any token IS ratio above this threshold. Suggested: 100. None to disable.
token_mask_is_threshold_lowOptional[float]Set to mask per-token when IS ratio < token_mask_is_threshold_low. None to disable.
token_mask_is_threshold_highOptional[float]Set to mask per-token when IS ratio > token_mask_is_threshold_high. None to disable.
Source code in skyrl/train/config/config.py:293-319
@dataclass
class OffPolicyCorrectionConfig(BaseConfig):
    tis_ratio_type: Optional[str] = None
    """Importance sampling ratio type for PPO loss correction: ``None``, ``"token"``, or ``"sequence"``.
    The ratio is ``exp(logprobs_policy_old - logprobs_rollout_policy)``."""
    token_tis_ratio_clip_high: float = 2.0
    """Used when ``tis_ratio_type="token"``. Recommended range: 1.5--5.0."""
    sequence_tis_ratio_clip_high: float = 5.0
    """Used when ``tis_ratio_type="sequence"``. Recommended range: 2.0--10.0."""
    sequence_mask_metric: Optional[str] = None
    """Method for masking sequences with cumulative IS ratios outside cap: ``None``, ``"product"``, or ``"geometric"``."""
    geo_mask_high: float = 1.01
    """Used when ``sequence_mask_metric="geometric"``. Recommended ~0.99--1.01; MoE models may need a wider range."""
    geo_mask_low: float = 0.99
    """Used when ``sequence_mask_metric="geometric"``."""
    product_mask_high: float = 2.0
    """Used when ``sequence_mask_metric="product"``. Recommended ~0.5--2.0."""
    product_mask_low: float = 0.5
    """Used when ``sequence_mask_metric="product"``."""
    outlier_token_is_threshold_low: Optional[float] = None
    """Set to mask sequences with any token IS ratio below this threshold. Suggested: 1e-4. ``None`` to disable."""
    outlier_token_is_threshold_high: Optional[float] = None
    """Set to mask sequences with any token IS ratio above this threshold. Suggested: 100. ``None`` to disable."""
    token_mask_is_threshold_low: Optional[float] = None
    """Set to mask per-token when IS ratio < `token_mask_is_threshold_low`. ``None`` to disable."""
    token_mask_is_threshold_high: Optional[float] = None
    """Set to mask per-token when IS ratio > `token_mask_is_threshold_high`. ``None`` to disable."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr tis_ratio_type

tis_ratio_type: Optional[str] = None

Importance sampling ratio type for PPO loss correction: None, "token", or "sequence". The ratio is exp(logprobs_policy_old - logprobs_rollout_policy).

attr token_tis_ratio_clip_high

token_tis_ratio_clip_high: float = 2.0

Used when tis_ratio_type="token". Recommended range: 1.5--5.0.

attr sequence_tis_ratio_clip_high

sequence_tis_ratio_clip_high: float = 5.0

Used when tis_ratio_type="sequence". Recommended range: 2.0--10.0.

attr sequence_mask_metric

sequence_mask_metric: Optional[str] = None

Method for masking sequences with cumulative IS ratios outside cap: None, "product", or "geometric".

attr geo_mask_high

geo_mask_high: float = 1.01

Used when sequence_mask_metric="geometric". Recommended ~0.99--1.01; MoE models may need a wider range.

attr geo_mask_low

geo_mask_low: float = 0.99

Used when sequence_mask_metric="geometric".

attr product_mask_high

product_mask_high: float = 2.0

Used when sequence_mask_metric="product". Recommended ~0.5--2.0.

attr product_mask_low

product_mask_low: float = 0.5

Used when sequence_mask_metric="product".

attr outlier_token_is_threshold_low

outlier_token_is_threshold_low: Optional[float] = None

Set to mask sequences with any token IS ratio below this threshold. Suggested: 1e-4. None to disable.

attr outlier_token_is_threshold_high

outlier_token_is_threshold_high: Optional[float] = None

Set to mask sequences with any token IS ratio above this threshold. Suggested: 100. None to disable.

attr token_mask_is_threshold_low

token_mask_is_threshold_low: Optional[float] = None

Set to mask per-token when IS ratio < token_mask_is_threshold_low. None to disable.

attr token_mask_is_threshold_high

token_mask_is_threshold_high: Optional[float] = None

Set to mask per-token when IS ratio > token_mask_is_threshold_high. None to disable.

class FullyAsyncConfig

FullyAsyncConfig(max_staleness_steps: int = 4, num_parallel_generation_workers: int = 768) -> None

Bases: BaseConfig

Knobs for fully async training. See https://docs.skyrl.ai/docs/tutorials/fully_async#step-2-config-knobs-to-tune-for-fully-async-training.

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
max_staleness_stepsintMaximum off-policy steps allowed. If a trajectory group is scheduled at step i and trained at step j, then j - i <= max_staleness_steps.
num_parallel_generation_workersintNumber of generation workers to spawn. Should be >= policy_mini_batch_size and <= policy_mini_batch_size * (max_staleness_steps + 1).
Source code in skyrl/train/config/config.py:383-393
@dataclass
class FullyAsyncConfig(BaseConfig):
    """Knobs for fully async training.
    See https://docs.skyrl.ai/docs/tutorials/fully_async#step-2-config-knobs-to-tune-for-fully-async-training."""

    max_staleness_steps: int = 4
    """Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*,
    then ``j - i <= max_staleness_steps``. Larger values increase throughput but also off-policy-ness."""
    num_parallel_generation_workers: int = 768
    """Number of generation workers to spawn. Should be >= ``policy_mini_batch_size`` and
    <= ``policy_mini_batch_size * (max_staleness_steps + 1)``."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr max_staleness_steps

max_staleness_steps: int = 4

Maximum off-policy steps allowed. If a trajectory group is scheduled at step i and trained at step j, then j - i <= max_staleness_steps. Larger values increase throughput but also off-policy-ness.

attr num_parallel_generation_workers

num_parallel_generation_workers: int = 768

Number of generation workers to spawn. Should be >= policy_mini_batch_size and <= policy_mini_batch_size * (max_staleness_steps + 1).

Inference & Generation

class SamplingParams

SamplingParams(max_generate_length: int = 1024, repetition_penalty: float = 1.0, temperature: float = 1.0, top_p: float = 1.0, min_p: float = 0.0, top_k: int = -1, logprobs: Optional[int] = 1, stop: Optional[List[str]] = None, additional_kwargs: Optional[Dict[str, Any]] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
max_generate_lengthint
repetition_penaltyfloat
temperaturefloat
top_pfloat
min_pfloat
top_kint
logprobsOptional[int]
stopOptional[List[str]]
additional_kwargsOptional[Dict[str, Any]]
Source code in skyrl/train/config/config.py:401-411
@dataclass
class SamplingParams(BaseConfig):
    max_generate_length: int = 1024
    repetition_penalty: float = 1.0
    temperature: float = 1.0
    top_p: float = 1.0
    min_p: float = 0.0
    top_k: int = -1
    logprobs: Optional[int] = 1
    stop: Optional[List[str]] = None
    additional_kwargs: Optional[Dict[str, Any]] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr max_generate_length

max_generate_length: int = 1024

attr repetition_penalty

repetition_penalty: float = 1.0

attr temperature

temperature: float = 1.0

attr top_p

top_p: float = 1.0

attr min_p

min_p: float = 0.0

attr top_k

top_k: int = -1

attr logprobs

logprobs: Optional[int] = 1

attr stop

stop: Optional[List[str]] = None

attr additional_kwargs

additional_kwargs: Optional[Dict[str, Any]] = None

class ChatTemplateConfig

ChatTemplateConfig(source: str = 'name', name_or_path: Optional[str] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
sourcestr
name_or_pathOptional[str]
Source code in skyrl/train/config/config.py:414-417
@dataclass
class ChatTemplateConfig(BaseConfig):
    source: str = "name"
    name_or_path: Optional[str] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr source

source: str = 'name'

attr name_or_path

name_or_path: Optional[str] = None

class InferenceEngineConfig

InferenceEngineConfig(model_dtype: str = 'bfloat16', run_engines_locally: bool = True, num_engines: int = 1, backend: str = 'vllm', weight_sync_backend: str = 'nccl', weight_transfer_threshold_cuda_ipc_GB: float = 1.0, tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1, expert_parallel_size: int = 1, data_parallel_size: int = 1, async_engine: bool = True, vllm_v1_disable_multiproc: bool = True, enable_prefix_caching: bool = True, enable_chunked_prefill: bool = True, max_num_batched_tokens: int = 8192, enforce_eager: bool = True, fully_sharded_loras: bool = False, enable_ray_prometheus_stats: bool = False, gpu_memory_utilization: float = 0.8, max_num_seqs: int = 1024, remote_urls: List[str] = (lambda: [])(), enable_http_endpoint: bool = False, http_endpoint_host: str = '127.0.0.1', http_endpoint_port: int = 8000, served_model_name: Optional[str] = None, distributed_executor_backend: str = 'ray', engine_init_kwargs: Dict[str, Any] = dict(), override_existing_update_group: str = 'auto', external_proxy_url: Optional[str] = None, external_server_urls: Optional[List[str]] = None) -> None

Bases: BaseConfig

Configuration for inference engine instantiation and management.

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
model_dtypestrShould match the dtype used by the inference engine.
run_engines_locallybool
num_enginesint
backendstr"vllm".
weight_sync_backendstr
weight_transfer_threshold_cuda_ipc_GBfloatWhen using cuda_ipc, send weights in batches of this size (GB).
tensor_parallel_sizeint
pipeline_parallel_sizeint
expert_parallel_sizeint
data_parallel_sizeint
async_enginebool
vllm_v1_disable_multiprocboolSets VLLM_ENABLE_V1_MULTIPROCESSING=0 for reproducibility.
enable_prefix_cachingbool
enable_chunked_prefillbool
max_num_batched_tokensint
enforce_eagerboolDisable CUDA graphs for stability. Set to False for higher performance,
fully_sharded_lorasbool
enable_ray_prometheus_statsboolEnable Ray Prometheus stats logger for inference engine metrics (vLLM v1 only).
gpu_memory_utilizationfloat
max_num_seqsint
remote_urlsList[str]
enable_http_endpointboolWhen True, launch an OpenAI-compatible HTTP endpoint for the inference engine client so that generators can send requests to this server instead of using .generate() Python calls.
http_endpoint_hoststr
http_endpoint_portint
served_model_nameOptional[str]Model name for HTTP endpoint validation. If set, must be used in the model field of
distributed_executor_backendstrDistributed executor backend for vLLM. Set to "ray" to use the Ray backend
engine_init_kwargsDict[str, Any]Pass-through kwargs for the vLLM engine. Names must match the engine's args.
override_existing_update_groupstr"auto", "enable", or "disable".
external_proxy_urlOptional[str]Data-plane URL (load-balanced router) for the new inference layer.
external_server_urlsOptional[List[str]]Control-plane URLs (direct backend access) for the new inference layer.
Source code in skyrl/train/config/config.py:425-478
@dataclass
class InferenceEngineConfig(BaseConfig):
    """Configuration for inference engine instantiation and management."""

    model_dtype: str = "bfloat16"
    """Should match the dtype used by the inference engine."""
    run_engines_locally: bool = True
    num_engines: int = 1
    backend: str = "vllm"
    """``"vllm"``."""
    weight_sync_backend: str = "nccl"
    weight_transfer_threshold_cuda_ipc_GB: float = 1.0
    """When using ``cuda_ipc``, send weights in batches of this size (GB)."""
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1
    expert_parallel_size: int = 1
    data_parallel_size: int = 1
    async_engine: bool = True
    vllm_v1_disable_multiproc: bool = True
    """Sets ``VLLM_ENABLE_V1_MULTIPROCESSING=0`` for reproducibility."""
    enable_prefix_caching: bool = True
    enable_chunked_prefill: bool = True
    max_num_batched_tokens: int = 8192
    enforce_eager: bool = True
    """Disable CUDA graphs for stability. Set to ``False`` for higher performance,
    but this may affect convergence for long-running or long-context training jobs."""
    fully_sharded_loras: bool = False
    enable_ray_prometheus_stats: bool = False
    """Enable Ray Prometheus stats logger for inference engine metrics (vLLM v1 only)."""
    gpu_memory_utilization: float = 0.8
    max_num_seqs: int = 1024
    remote_urls: List[str] = field(default_factory=lambda: [])
    enable_http_endpoint: bool = False
    """When ``True``, launch an OpenAI-compatible HTTP endpoint for the inference engine client so that generators can send requests to this server instead of using ``.generate()`` Python calls.
    
    NOTE: When using HTTP endpoints directly, make sure to set ``trainer.algorithm.temperature`` to the temperature used during generation
    """
    http_endpoint_host: str = "127.0.0.1"
    http_endpoint_port: int = 8000
    served_model_name: Optional[str] = None
    """Model name for HTTP endpoint validation. If set, must be used in the ``model`` field of
    ``/chat/completions`` requests instead of the model path. If ``None``, the model path is used."""
    distributed_executor_backend: str = "ray"
    """Distributed executor backend for vLLM. Set to ``"ray"`` to use the Ray backend
    or ``"mp"`` to use the multiprocessing backend (single-node serving only). Per-engine 
    placement groups are created when ``"mp"`` is used."""
    engine_init_kwargs: Dict[str, Any] = field(default_factory=dict)
    """Pass-through kwargs for the vLLM engine. Names must match the engine's args."""
    override_existing_update_group: str = "auto"
    """``"auto"``, ``"enable"``, or ``"disable"``."""
    external_proxy_url: Optional[str] = None
    """Data-plane URL (load-balanced router) for the new inference layer."""
    external_server_urls: Optional[List[str]] = None
    """Control-plane URLs (direct backend access) for the new inference layer."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model_dtype

model_dtype: str = 'bfloat16'

Should match the dtype used by the inference engine.

attr run_engines_locally

run_engines_locally: bool = True

attr num_engines

num_engines: int = 1

attr backend

backend: str = 'vllm'

"vllm".

attr weight_sync_backend

weight_sync_backend: str = 'nccl'

attr weight_transfer_threshold_cuda_ipc_GB

weight_transfer_threshold_cuda_ipc_GB: float = 1.0

When using cuda_ipc, send weights in batches of this size (GB).

attr tensor_parallel_size

tensor_parallel_size: int = 1

attr pipeline_parallel_size

pipeline_parallel_size: int = 1

attr expert_parallel_size

expert_parallel_size: int = 1

attr data_parallel_size

data_parallel_size: int = 1

attr async_engine

async_engine: bool = True

attr vllm_v1_disable_multiproc

vllm_v1_disable_multiproc: bool = True

Sets VLLM_ENABLE_V1_MULTIPROCESSING=0 for reproducibility.

attr enable_prefix_caching

enable_prefix_caching: bool = True

attr enable_chunked_prefill

enable_chunked_prefill: bool = True

attr max_num_batched_tokens

max_num_batched_tokens: int = 8192

attr enforce_eager

enforce_eager: bool = True

Disable CUDA graphs for stability. Set to False for higher performance, but this may affect convergence for long-running or long-context training jobs.

attr fully_sharded_loras

fully_sharded_loras: bool = False

attr enable_ray_prometheus_stats

enable_ray_prometheus_stats: bool = False

Enable Ray Prometheus stats logger for inference engine metrics (vLLM v1 only).

attr gpu_memory_utilization

gpu_memory_utilization: float = 0.8

attr max_num_seqs

max_num_seqs: int = 1024

attr remote_urls

remote_urls: List[str] = field(default_factory=(lambda: []))

attr enable_http_endpoint

enable_http_endpoint: bool = False

When True, launch an OpenAI-compatible HTTP endpoint for the inference engine client so that generators can send requests to this server instead of using .generate() Python calls.

NOTE: When using HTTP endpoints directly, make sure to set trainer.algorithm.temperature to the temperature used during generation

attr http_endpoint_host

http_endpoint_host: str = '127.0.0.1'

attr http_endpoint_port

http_endpoint_port: int = 8000

attr served_model_name

served_model_name: Optional[str] = None

Model name for HTTP endpoint validation. If set, must be used in the model field of /chat/completions requests instead of the model path. If None, the model path is used.

attr distributed_executor_backend

distributed_executor_backend: str = 'ray'

Distributed executor backend for vLLM. Set to "ray" to use the Ray backend or "mp" to use the multiprocessing backend (single-node serving only). Per-engine placement groups are created when "mp" is used.

attr engine_init_kwargs

engine_init_kwargs: Dict[str, Any] = field(default_factory=dict)

Pass-through kwargs for the vLLM engine. Names must match the engine's args.

attr override_existing_update_group

override_existing_update_group: str = 'auto'

"auto", "enable", or "disable".

attr external_proxy_url

external_proxy_url: Optional[str] = None

Data-plane URL (load-balanced router) for the new inference layer.

attr external_server_urls

external_server_urls: Optional[List[str]] = None

Control-plane URLs (direct backend access) for the new inference layer.

class GeneratorConfig

GeneratorConfig(inference_engine: InferenceEngineConfig = InferenceEngineConfig(), n_samples_per_prompt: int = 5, batched: bool = False, max_turns: int = 1, max_input_length: Optional[int] = None, chat_template: ChatTemplateConfig = ChatTemplateConfig(), chat_template_kwargs: Dict[str, Any] = dict(), sampling_params: SamplingParams = SamplingParams(), use_conversation_multi_turn: bool = True, append_eos_token_after_stop_str_in_multi_turn: bool = True, eval_sampling_params: Optional[SamplingParams] = None, eval_n_samples_per_prompt: int = 1, zero_reward_on_non_stop: bool = False, apply_overlong_filtering: bool = False, rope_scaling: Optional[Dict[str, Any]] = None, rope_theta: Optional[float] = None, step_wise_trajectories: bool = False) -> None

Bases: BaseConfig

Configuration for generation behavior.

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
inference_engineInferenceEngineConfig
n_samples_per_promptint
batchedbool
max_turnsint
max_input_lengthOptional[int]Max generator input length for multi-turn conversations. For single-turn, set equal to max_prompt_length.
chat_templateChatTemplateConfig
chat_template_kwargsDict[str, Any]Kwargs passed to tokenizer.apply_chat_template.
sampling_paramsSamplingParams
use_conversation_multi_turnboolIf True, each multi-turn model response and env observation is stored in a separate
append_eos_token_after_stop_str_in_multi_turnboolWhen use_conversation_multi_turn=True and sampling_params.stop is set, append
eval_sampling_paramsOptional[SamplingParams]Separate sampling params for evaluation. If None, then it defaults to SamplingParams(temperature=0.0, max_generate_length=generator.sampling_params.max_generate_length).
eval_n_samples_per_promptint
zero_reward_on_non_stopboolSet reward to 0 when stop_reason is not "stop" (i.e., generation was truncated or aborted).
apply_overlong_filteringboolApply DAPO Overlong Filtering: mask out all tokens in the loss mask for trajectories that
rope_scalingOptional[Dict[str, Any]]Can differ from the trainer's rope_scaling, useful for thinking models.
rope_thetaOptional[float]
step_wise_trajectoriesbool
Source code in skyrl/train/config/config.py:486-524
@dataclass
class GeneratorConfig(BaseConfig):
    """Configuration for generation behavior."""

    inference_engine: InferenceEngineConfig = field(default_factory=InferenceEngineConfig)
    n_samples_per_prompt: int = 5
    batched: bool = False
    max_turns: int = 1
    max_input_length: Optional[int] = None
    """Max generator input length for multi-turn conversations. For single-turn, set equal to ``max_prompt_length``."""
    chat_template: ChatTemplateConfig = field(default_factory=ChatTemplateConfig)
    chat_template_kwargs: Dict[str, Any] = field(default_factory=dict)
    """Kwargs passed to ``tokenizer.apply_chat_template``."""
    sampling_params: SamplingParams = field(default_factory=SamplingParams)
    use_conversation_multi_turn: bool = True
    """If ``True``, each multi-turn model response and env observation is stored in a separate
    assistant/user message. If ``False``, they are appended to the original assistant response."""
    append_eos_token_after_stop_str_in_multi_turn: bool = True
    """When ``use_conversation_multi_turn=True`` and ``sampling_params.stop`` is set, append
    ``eos_token_id`` to generations that end with a matched stop string."""
    eval_sampling_params: Optional[SamplingParams] = None
    """Separate sampling params for evaluation. If ``None``, then it defaults to ``SamplingParams(temperature=0.0, max_generate_length=generator.sampling_params.max_generate_length)``."""
    eval_n_samples_per_prompt: int = 1
    zero_reward_on_non_stop: bool = False
    """Set reward to 0 when ``stop_reason`` is not ``"stop"`` (i.e., generation was truncated or aborted)."""
    apply_overlong_filtering: bool = False
    """Apply DAPO Overlong Filtering: mask out all tokens in the loss mask for trajectories that
    exceed max length (truncated, no EOS token)."""
    rope_scaling: Optional[Dict[str, Any]] = None
    """Can differ from the trainer's ``rope_scaling``, useful for thinking models."""
    rope_theta: Optional[float] = None
    step_wise_trajectories: bool = False

    def __post_init__(self):

        if self.eval_sampling_params is None:
            self.eval_sampling_params = SamplingParams(
                temperature=0.0, max_generate_length=self.sampling_params.max_generate_length
            )

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr inference_engine

inference_engine: InferenceEngineConfig = field(default_factory=InferenceEngineConfig)

attr n_samples_per_prompt

n_samples_per_prompt: int = 5

attr batched

batched: bool = False

attr max_turns

max_turns: int = 1

attr max_input_length

max_input_length: Optional[int] = None

Max generator input length for multi-turn conversations. For single-turn, set equal to max_prompt_length.

attr chat_template

chat_template: ChatTemplateConfig = field(default_factory=ChatTemplateConfig)

attr chat_template_kwargs

chat_template_kwargs: Dict[str, Any] = field(default_factory=dict)

Kwargs passed to tokenizer.apply_chat_template.

attr sampling_params

sampling_params: SamplingParams = field(default_factory=SamplingParams)

attr use_conversation_multi_turn

use_conversation_multi_turn: bool = True

If True, each multi-turn model response and env observation is stored in a separate assistant/user message. If False, they are appended to the original assistant response.

attr append_eos_token_after_stop_str_in_multi_turn

append_eos_token_after_stop_str_in_multi_turn: bool = True

When use_conversation_multi_turn=True and sampling_params.stop is set, append eos_token_id to generations that end with a matched stop string.

attr eval_sampling_params

eval_sampling_params: Optional[SamplingParams] = None

Separate sampling params for evaluation. If None, then it defaults to SamplingParams(temperature=0.0, max_generate_length=generator.sampling_params.max_generate_length).

attr eval_n_samples_per_prompt

eval_n_samples_per_prompt: int = 1

attr zero_reward_on_non_stop

zero_reward_on_non_stop: bool = False

Set reward to 0 when stop_reason is not "stop" (i.e., generation was truncated or aborted).

attr apply_overlong_filtering

apply_overlong_filtering: bool = False

Apply DAPO Overlong Filtering: mask out all tokens in the loss mask for trajectories that exceed max length (truncated, no EOS token).

attr rope_scaling

rope_scaling: Optional[Dict[str, Any]] = None

Can differ from the trainer's rope_scaling, useful for thinking models.

attr rope_theta

rope_theta: Optional[float] = None

attr step_wise_trajectories

step_wise_trajectories: bool = False

Environment

class EnvironmentConfig

EnvironmentConfig(env_class: str = 'gsm8k', skyrl_gym: SkyRLGymConfig = SkyRLGymConfig()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
env_classstr
skyrl_gymSkyRLGymConfig
Source code in skyrl/train/config/config.py:547-550
@dataclass
class EnvironmentConfig(BaseConfig):
    env_class: str = "gsm8k"
    skyrl_gym: SkyRLGymConfig = field(default_factory=SkyRLGymConfig)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr env_class

env_class: str = 'gsm8k'

attr skyrl_gym

skyrl_gym: SkyRLGymConfig = field(default_factory=SkyRLGymConfig)

class SkyRLGymConfig

SkyRLGymConfig(max_env_workers: int = 32, text2sql: Text2SQLEnvConfig = Text2SQLEnvConfig(), llm_as_a_judge: GSM8kLLMJudgeEnvConfig = GSM8kLLMJudgeEnvConfig(), search: SearchEnvConfig = SearchEnvConfig()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
max_env_workersint
text2sqlText2SQLEnvConfig
llm_as_a_judgeGSM8kLLMJudgeEnvConfig
searchSearchEnvConfig
Source code in skyrl/train/config/config.py:539-544
@dataclass
class SkyRLGymConfig(BaseConfig):
    max_env_workers: int = 32
    text2sql: Text2SQLEnvConfig = field(default_factory=Text2SQLEnvConfig)
    llm_as_a_judge: GSM8kLLMJudgeEnvConfig = field(default_factory=GSM8kLLMJudgeEnvConfig)
    search: SearchEnvConfig = field(default_factory=SearchEnvConfig)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr max_env_workers

max_env_workers: int = 32

attr text2sql

text2sql: Text2SQLEnvConfig = field(default_factory=Text2SQLEnvConfig)

attr llm_as_a_judge

llm_as_a_judge: GSM8kLLMJudgeEnvConfig = field(default_factory=GSM8kLLMJudgeEnvConfig)

attr search

search: SearchEnvConfig = field(default_factory=SearchEnvConfig)

class GSM8kLLMJudgeEnvConfig

GSM8kLLMJudgeEnvConfig(model: str = 'gpt-4o-mini', base_url: Optional[str] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
modelstr
base_urlOptional[str]
Source code in skyrl/train/config/config.py:533-536
@dataclass
class GSM8kLLMJudgeEnvConfig(BaseConfig):
    model: str = "gpt-4o-mini"
    base_url: Optional[str] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: str = 'gpt-4o-mini'

attr base_url

base_url: Optional[str] = None

On this page

Top-Level Configclass SkyRLTrainConfigfrom_dict_configattr dataattr trainerattr generatorattr environmentmethod classmethod from_cli_overridesmethod make_configmethod get_config_as_dictmethod get_config_as_yaml_strData & Modelclass DataConfigfrom_dict_configattr train_dataattr val_dataclass ModelConfigfrom_dict_configattr pathattr loraclass SkyRLLoraConfigfrom_dict_configattr rankattr alphaattr dropoutattr lora_sync_pathattr target_modulesattr exclude_modulesattr init_methodTrainingclass TrainerConfigfrom_dict_configattr placementattr sequence_parallel_backendattr strategyattr policyattr refattr criticattr algorithmattr fully_asyncattr gradient_checkpointingattr gradient_checkpointing_use_reentrantattr seedattr resume_modeattr resume_pathattr log_pathattr ckpt_pathattr max_ckpts_to_keepattr ckpt_intervalattr hf_save_intervalattr export_pathattr bf16attr epochsattr update_epochs_per_batchattr train_batch_sizeattr policy_mini_batch_sizeattr critic_mini_batch_sizeattr micro_train_batch_size_per_gpuattr micro_forward_batch_size_per_gpuattr update_ref_every_epochattr use_sample_packingattr eval_batch_sizeattr eval_before_trainattr eval_intervalattr max_prompt_lengthattr flash_attnattr disable_fast_tokenizerattr project_nameattr run_nameattr loggerattr dump_data_batchattr dump_eval_resultsattr rope_scalingattr rope_thetaclass OptimizerConfigfrom_dict_configattr lrattr adam_betasattr weight_decayattr max_grad_normattr offload_after_stepattr num_warmup_stepsattr schedulerclass MixedPrecisionConfigfrom_dict_configattr param_dtypeattr reduce_dtypeattr buffer_dtypeBackend Configclass FSDPConfigfrom_dict_configattr cpu_offloadattr reshard_after_forwardattr fsdp_sizeattr mixed_precisionattr wrap_policyclass MegatronConfigfrom_dict_configattr tensor_model_parallel_sizeattr pipeline_model_parallel_sizeattr context_parallel_sizeattr expert_model_parallel_sizeattr expert_tensor_parallel_sizeattr moe_token_dispatcher_typeattr moe_router_load_balancing_typeattr moe_grouped_gemmattr 
moe_router_score_functionattr moe_router_enable_expert_biasattr ddp_configattr torch_profiler_configattr lora_configattr optimizer_config_kwargsattr transformer_config_kwargsattr empty_cuda_cacheattr model_config_kwargsattr dist_ckpt_optim_fully_reshardableclass MegatronDDPConfigfrom_dict_configattr grad_reduce_in_fp32attr overlap_grad_reduceattr overlap_param_gatherattr average_in_collectiveclass MegatronLoraConfigfrom_dict_configattr lora_typeclass MegatronTorchProfilerConfigfrom_dict_configattr enableattr ranksattr save_pathPlacementclass PlacementConfigfrom_dict_configattr colocate_allattr colocate_policy_refattr policy_num_nodesattr policy_num_gpus_per_nodeattr critic_num_nodesattr critic_num_gpus_per_nodeattr ref_num_nodesattr ref_num_gpus_per_nodePolicy & Algorithmclass PolicyConfigfrom_dict_configattr modelattr optimizer_configattr fsdp_configattr sequence_parallel_sizeattr use_torch_compileattr record_memoryattr megatron_configattr model_config_kwargsclass CriticConfigfrom_dict_configattr modelattr optimizer_configattr fsdp_configattr sequence_parallel_sizeattr model_config_kwargsclass RefConfigfrom_dict_configattr modelattr sequence_parallel_sizeattr fsdp_configattr megatron_configattr model_config_kwargsclass AlgorithmConfigfrom_dict_configattr advantage_estimatorattr kl_ctrlattr kl_estimator_typeattr use_kl_in_rewardattr use_kl_lossattr kl_loss_coefattr use_entropy_lossattr entropy_loss_coefattr temperatureattr advantage_batch_normalizeattr value_head_prefixattr policy_loss_typeattr loss_reductionattr grpo_norm_by_stdattr zero_variance_filterattr lambdattr gammaattr eps_clip_lowattr eps_clip_highattr clip_ratio_cattr tis_imp_ratio_capattr use_tisattr off_policy_correctionattr sapoattr value_clipattr dynamic_samplingattr clip_covattr kl_covattr cispoattr max_seq_lenclass KLCtrlConfigfrom_dict_configattr typeattr kl_targetattr horizonAlgorithm Extensionsclass SAPOConfigfrom_dict_configattr tau_posattr tau_negclass DynamicSamplingConfigfrom_dict_configattr 
typeattr max_sample_batchesattr min_replace_ratioclass ClipCovConfigfrom_dict_configattr clip_ratioattr clip_cov_lbattr clip_cov_ubclass KLCovConfigfrom_dict_configattr kl_cov_fracattr ppo_kl_coefclass CISPOConfigfrom_dict_configattr cispo_eps_clip_lowattr cispo_eps_clip_highclass OffPolicyCorrectionConfigfrom_dict_configattr tis_ratio_typeattr token_tis_ratio_clip_highattr sequence_tis_ratio_clip_highattr sequence_mask_metricattr geo_mask_highattr geo_mask_lowattr product_mask_highattr product_mask_lowattr outlier_token_is_threshold_lowattr outlier_token_is_threshold_highattr token_mask_is_threshold_lowattr token_mask_is_threshold_highclass FullyAsyncConfigfrom_dict_configattr max_staleness_stepsattr num_parallel_generation_workersInference & Generationclass SamplingParamsfrom_dict_configattr max_generate_lengthattr repetition_penaltyattr temperatureattr top_pattr min_pattr top_kattr logprobsattr stopattr additional_kwargsclass ChatTemplateConfigfrom_dict_configattr sourceattr name_or_pathclass InferenceEngineConfigfrom_dict_configattr model_dtypeattr run_engines_locallyattr num_enginesattr backendattr weight_sync_backendattr weight_transfer_threshold_cuda_ipc_GBattr tensor_parallel_sizeattr pipeline_parallel_sizeattr expert_parallel_sizeattr data_parallel_sizeattr async_engineattr vllm_v1_disable_multiprocattr enable_prefix_cachingattr enable_chunked_prefillattr max_num_batched_tokensattr enforce_eagerattr fully_sharded_lorasattr enable_ray_prometheus_statsattr gpu_memory_utilizationattr max_num_seqsattr remote_urlsattr enable_http_endpointattr http_endpoint_hostattr http_endpoint_portattr served_model_nameattr distributed_executor_backendattr engine_init_kwargsattr override_existing_update_groupattr external_proxy_urlattr external_server_urlsclass GeneratorConfigfrom_dict_configattr inference_engineattr n_samples_per_promptattr batchedattr max_turnsattr max_input_lengthattr chat_templateattr chat_template_kwargsattr sampling_paramsattr 
use_conversation_multi_turnattr append_eos_token_after_stop_str_in_multi_turnattr eval_sampling_paramsattr eval_n_samples_per_promptattr zero_reward_on_non_stopattr apply_overlong_filteringattr rope_scalingattr rope_thetaattr step_wise_trajectoriesEnvironmentclass EnvironmentConfigfrom_dict_configattr env_classattr skyrl_gymclass SkyRLGymConfigfrom_dict_configattr max_env_workersattr text2sqlattr llm_as_a_judgeattr searchclass GSM8kLLMJudgeEnvConfigfrom_dict_configattr modelattr base_url