SkyRL
API Reference — SkyRL

Configuration

Configuration dataclasses for SkyRL training.

Top-Level Config

The root configuration object and helpers.

class SkyRLTrainConfig

SkyRLTrainConfig(data: DataConfig = DataConfig(), trainer: TrainerConfig = TrainerConfig(), generator: GeneratorConfig = GeneratorConfig(), environment: EnvironmentConfig = EnvironmentConfig()) -> None

Bases: BaseConfig

Functions:

Name — Description
from_dict_config — Construct a typed BaseConfig from a Hydra DictConfig.
from_cli_overrides — Construct a SkyRLTrainConfig from CLI arguments or a dict of overrides.

Attributes:

Source code in skyrl/train/config/config.py:694-787
@dataclass
class SkyRLTrainConfig(BaseConfig):
    """Root configuration for SkyRL training.

    Composes the four top-level config sections (data, trainer, generator,
    environment). ``__post_init__`` cross-fills defaults between the
    generator and trainer sections so each side can be configured
    independently on the CLI.
    """

    data: DataConfig = field(default_factory=DataConfig)
    trainer: TrainerConfig = field(default_factory=TrainerConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    environment: EnvironmentConfig = field(default_factory=EnvironmentConfig)

    def __post_init__(self):
        """Cross-fill defaults between the generator and trainer sections."""

        # generator.max_input_length defaults to trainer.max_prompt_length
        if self.generator.max_input_length is None:
            self.generator.max_input_length = self.trainer.max_prompt_length

        # generator rope params default to trainer rope params
        if self.generator.rope_scaling is None and self.trainer.rope_scaling is not None:
            self.generator.rope_scaling = self.trainer.rope_scaling
        if self.generator.rope_theta is None and self.trainer.rope_theta is not None:
            self.generator.rope_theta = self.trainer.rope_theta
        # Copy temperature from generator sampling params to algorithm config
        # so workers can access it without needing the generator config
        if self.trainer.algorithm.temperature is None:
            self.trainer.algorithm.temperature = self.generator.sampling_params.temperature

        if self.trainer.algorithm.max_seq_len is None:
            # NOTE (erictang000): this is the max sequence length including the prompt, since max response length
            # per batch can be variable based on the prompt length. This is used to normalize the loss for
            # seq_mean_token_sum_norm loss reduction.
            # TODO(Charlie): This calculation is not correct for multi-turn and users should use `max_seq_len` instead.
            # Should we just force users to set max_seq_len if loss reduction is seq_mean_token_sum_norm, regardless of
            # multi-turn or not?
            self.trainer.algorithm.max_seq_len = (
                self.generator.max_input_length + self.generator.sampling_params.max_generate_length
            )

    @classmethod
    def from_cli_overrides(cls, args: Union[List[str], dict]) -> "SkyRLTrainConfig":
        """Construct a SkyRLTrainConfig from CLI arguments or a dict of overrides.

        Parses CLI arguments and builds a typed config. Dataclass field defaults
        are used for any values not specified on the command line.

        Supports both new-style config paths (e.g., generator.inference_engine.backend)
        and legacy YAML-style paths (e.g., generator.backend) for backward compatibility.

        Args:
            args: Either a list of CLI arguments in 'key.path=value' format, or a dict
                  mapping dot-notation keys to values.
                  Example list: ['trainer.policy.model.path=Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed=123']
                  Example dict: {'trainer.policy.model.path': 'Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed': 123}

        Returns:
            A fully constructed SkyRLTrainConfig with CLI overrides applied.

        Raises:
            ValueError: If an argument uses the unsupported '+' prefix.
        """
        # Normalize the dict form to the canonical 'key=value' list form.
        if isinstance(args, dict):
            args = [f"{k}={v}" for k, v in args.items()]

        # Imported lazily; NOTE(review): presumably to avoid an import cycle — confirm.
        from skyrl.train.config.legacy import (
            is_legacy_config,
            translate_legacy_config,
            warn_legacy_config,
        )
        from skyrl.train.config.utils import get_legacy_config

        # Check for unsupported '+' prefix
        for arg in args:
            if arg.startswith("+"):
                raise ValueError(
                    f"The '+' prefix for adding new config fields is not supported: '{arg}'. "
                    "To add custom config fields, subclass the relevant config dataclass."
                )
        overrides = OmegaConf.from_cli(args)

        # Try new format first
        try:
            return cls.from_dict_config(overrides)
        except ValueError:
            # Fall back to legacy format: load base YAML, merge overrides, translate
            try:
                base_cfg = get_legacy_config()
                merged = OmegaConf.merge(base_cfg, overrides)
                merged_dict = OmegaConf.to_container(merged, resolve=True)

                if is_legacy_config(merged_dict):
                    warn_legacy_config()
                    translated = translate_legacy_config(merged_dict)
                    return build_nested_dataclass(cls, translated)
            except Exception:
                pass  # Legacy translation failed, re-raise original error

            # Re-raise original error if not a legacy config issue
            # (bare `raise` re-raises the ValueError from from_dict_config above)
            raise

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr data

data: DataConfig = field(default_factory=DataConfig)

attr trainer

trainer: TrainerConfig = field(default_factory=TrainerConfig)

attr generator

generator: GeneratorConfig = field(default_factory=GeneratorConfig)

attr environment

environment: EnvironmentConfig = field(default_factory=EnvironmentConfig)

method classmethod from_cli_overrides

from_cli_overrides(args: Union[List[str], dict]) -> SkyRLTrainConfig

Construct a SkyRLTrainConfig from CLI arguments or a dict of overrides.

Parses CLI arguments and builds a typed config. Dataclass field defaults are used for any values not specified on the command line.

Supports both new-style config paths (e.g., generator.inference_engine.backend) and legacy YAML-style paths (e.g., generator.backend) for backward compatibility.

Parameters:

Name: args · Type: Union[List[str], dict] · Default: required
Description: Either a list of CLI arguments in 'key.path=value' format, or a dict mapping dot-notation keys to values. Example list: ['trainer.policy.model.path=Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed=123']. Example dict: {'trainer.policy.model.path': 'Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed': 123}.

Returns:

TypeDescription
SkyRLTrainConfigA fully constructed SkyRLTrainConfig with CLI overrides applied.

Raises:

TypeDescription
ValueErrorIf an argument uses the unsupported '+' prefix.
Source code in skyrl/train/config/config.py:728-787
    @classmethod
    def from_cli_overrides(cls, args: Union[List[str], dict]) -> "SkyRLTrainConfig":
        """Build a SkyRLTrainConfig from CLI overrides.

        Accepts either a list of 'key.path=value' strings or a dict mapping
        dot-notation keys to values; dataclass field defaults fill in anything
        not overridden. Both new-style config paths
        (e.g., generator.inference_engine.backend) and legacy YAML-style paths
        (e.g., generator.backend) are accepted for backward compatibility.

        Args:
            args: CLI arguments as a list of 'key.path=value' entries, or a dict
                  of dot-notation keys to values.
                  Example list: ['trainer.policy.model.path=Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed=123']
                  Example dict: {'trainer.policy.model.path': 'Qwen/Qwen2.5-1.5B-Instruct', 'trainer.seed': 123}

        Returns:
            A fully constructed SkyRLTrainConfig with the overrides applied.

        Raises:
            ValueError: If an argument uses the unsupported '+' prefix.
        """
        # Normalize the dict form into the canonical 'key=value' list form.
        if isinstance(args, dict):
            args = [f"{k}={v}" for k, v in args.items()]

        from skyrl.train.config.legacy import (
            is_legacy_config,
            translate_legacy_config,
            warn_legacy_config,
        )
        from skyrl.train.config.utils import get_legacy_config

        # Reject Hydra's '+' append syntax up front.
        for arg in args:
            if arg.startswith("+"):
                raise ValueError(
                    f"The '+' prefix for adding new config fields is not supported: '{arg}'. "
                    "To add custom config fields, subclass the relevant config dataclass."
                )
        cli_cfg = OmegaConf.from_cli(args)

        # First attempt: interpret the overrides against the new-style schema.
        try:
            return cls.from_dict_config(cli_cfg)
        except ValueError:
            # Second attempt: treat the overrides as legacy YAML-style paths by
            # merging them onto the legacy base config and translating.
            try:
                legacy_base = get_legacy_config()
                merged_cfg = OmegaConf.merge(legacy_base, cli_cfg)
                merged_container = OmegaConf.to_container(merged_cfg, resolve=True)

                if is_legacy_config(merged_container):
                    warn_legacy_config()
                    return build_nested_dataclass(cls, translate_legacy_config(merged_container))
            except Exception:
                pass  # Legacy handling failed; surface the original error below.

            # Not a legacy config (or translation failed): re-raise the
            # original ValueError from from_dict_config.
            raise

method make_config

make_config(algorithm_cls: Optional[Type[AlgorithmConfig]] = None, trainer_cls: Optional[Type[TrainerConfig]] = None, generator_cls: Optional[Type[GeneratorConfig]] = None) -> Type[SkyRLTrainConfig]

Create a SkyRLTrainConfig subclass with custom nested config classes.

Convenience helper to avoid boilerplate when extending configs for custom algorithms or generators. For full IDE autocomplete on custom fields, use explicit subclassing instead (see examples/algorithms/dapo/main_dapo.py).

Parameters:

Name: algorithm_cls · Type: Optional[Type[AlgorithmConfig]] · Default: None
Description: Custom AlgorithmConfig subclass. If provided without trainer_cls, a TrainerConfig subclass is automatically created.
Name: trainer_cls · Type: Optional[Type[TrainerConfig]] · Default: None
Description: Custom TrainerConfig subclass. Takes precedence over algorithm_cls for the trainer config.
Name: generator_cls · Type: Optional[Type[GeneratorConfig]] · Default: None
Description: Custom GeneratorConfig subclass.

Returns:

TypeDescription
Type[SkyRLTrainConfig]A SkyRLTrainConfig subclass wired up with the custom config classes.

Example::

@dataclass
class MyAlgorithmConfig(AlgorithmConfig):
    my_param: int = 42

MyConfig = make_config(algorithm_cls=MyAlgorithmConfig)
cfg = MyConfig.from_cli_overrides(sys.argv[1:])

method get_config_as_dict

get_config_as_dict(cfg: Union[dict, BaseConfig]) -> dict

method get_config_as_yaml_str

get_config_as_yaml_str(cfg: BaseConfig) -> str

Data & Model

class DataConfig

DataConfig(train_data: List[str] = (lambda: [os.path.expanduser('~/data/gsm8k/train.parquet')])(), val_data: List[str] = (lambda: [os.path.expanduser('~/data/gsm8k/validation.parquet')])()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
train_dataList[str]
val_dataList[str]
Source code in skyrl/train/config/config.py:40-43
@dataclass
class DataConfig(BaseConfig):
    """Paths to training and validation dataset files.

    Defaults point at the GSM8K parquet files under the user's home directory.
    """

    train_data: List[str] = field(default_factory=lambda: [os.path.expanduser("~/data/gsm8k/train.parquet")])
    val_data: List[str] = field(default_factory=lambda: [os.path.expanduser("~/data/gsm8k/validation.parquet")])

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr train_data

train_data: List[str] = field(default_factory=(lambda: [os.path.expanduser('~/data/gsm8k/train.parquet')]))

attr val_data

val_data: List[str] = field(default_factory=(lambda: [os.path.expanduser('~/data/gsm8k/validation.parquet')]))

class ModelConfig

ModelConfig(path: Optional[str] = None, lora: SkyRLLoraConfig = SkyRLLoraConfig()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
pathOptional[str]
loraSkyRLLoraConfig
Source code in skyrl/train/config/config.py:65-68
@dataclass
class ModelConfig(BaseConfig):
    """Model location plus optional LoRA adapter settings."""

    # Model path; None means "unset" (e.g. the ref model path is copied from
    # the policy model in TrainerConfig.__post_init__ when left as None).
    path: Optional[str] = None
    lora: SkyRLLoraConfig = field(default_factory=SkyRLLoraConfig)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr path

path: Optional[str] = None

attr lora

lora: SkyRLLoraConfig = field(default_factory=SkyRLLoraConfig)

class SkyRLLoraConfig

SkyRLLoraConfig(rank: int = 0, alpha: int = 16, dropout: float = 0.0, lora_sync_path: str = '/tmp/skyrl_lora_sync', target_modules: str = 'all-linear', exclude_modules: Optional[str] = None, init_method: str = 'kaiming') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
rankint
alphaint
dropoutfloat
lora_sync_pathstr
target_modulesstr
exclude_modulesOptional[str]
init_methodstrFor FSDP, corresponds to init_lora_weights in PEFT.
Source code in skyrl/train/config/config.py:52-62
@dataclass
class SkyRLLoraConfig(BaseConfig):
    """LoRA adapter configuration."""

    # NOTE(review): a rank of 0 presumably disables LoRA — confirm against the trainer code.
    rank: int = 0
    alpha: int = 16
    dropout: float = 0.0
    # Filesystem path; name suggests it is used to sync LoRA weights to the
    # inference engine — TODO confirm.
    lora_sync_path: str = "/tmp/skyrl_lora_sync"
    target_modules: str = "all-linear"
    exclude_modules: Optional[str] = None
    init_method: str = "kaiming"
    """For FSDP, corresponds to ``init_lora_weights`` in PEFT.
    For Megatron, used for ``lora_A_init_method``; supports "xavier", "normal", "kaiming", "zero"."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr rank

rank: int = 0

attr alpha

alpha: int = 16

attr dropout

dropout: float = 0.0

attr lora_sync_path

lora_sync_path: str = '/tmp/skyrl_lora_sync'

attr target_modules

target_modules: str = 'all-linear'

attr exclude_modules

exclude_modules: Optional[str] = None

attr init_method

init_method: str = 'kaiming'

For FSDP, corresponds to init_lora_weights in PEFT. For Megatron, used for lora_A_init_method; supports "xavier", "normal", "kaiming", "zero".

Training

class TrainerConfig

TrainerConfig(placement: PlacementConfig = PlacementConfig(), sequence_parallel_backend: str = 'ulysses', strategy: str = 'fsdp2', policy: PolicyConfig = PolicyConfig(), ref: RefConfig = RefConfig(), critic: CriticConfig = CriticConfig(), algorithm: AlgorithmConfig = AlgorithmConfig(), fully_async: FullyAsyncConfig = FullyAsyncConfig(), gradient_checkpointing: bool = True, gradient_checkpointing_use_reentrant: bool = False, seed: int = 42, resume_mode: Optional[str] = 'latest', resume_path: Optional[str] = None, log_path: str = '/tmp/skyrl-logs', ckpt_path: str = (lambda: os.path.expanduser('~/ckpts/'))(), max_ckpts_to_keep: int = -1, ckpt_interval: int = 10, hf_save_interval: int = -1, export_path: str = (lambda: os.path.expanduser('~/exports/'))(), bf16: bool = True, epochs: int = 1, update_epochs_per_batch: int = 1, train_batch_size: int = 1024, policy_mini_batch_size: int = 256, critic_mini_batch_size: int = 256, micro_train_batch_size_per_gpu: int = 1, micro_forward_batch_size_per_gpu: int = 1, update_ref_every_epoch: bool = False, use_sample_packing: bool = True, eval_batch_size: int = 1024, eval_before_train: bool = True, eval_interval: int = 5, max_prompt_length: int = 512, flash_attn: bool = True, disable_fast_tokenizer: bool = False, project_name: str = 'skyrl', run_name: str = 'test_run', logger: str = 'wandb', dump_data_batch: bool = False, dump_eval_results: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, rope_theta: Optional[float] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
placementPlacementConfig
sequence_parallel_backendstr
strategystr
policyPolicyConfig
refRefConfig
criticCriticConfig
algorithmAlgorithmConfig
fully_asyncFullyAsyncConfig
gradient_checkpointingbool
gradient_checkpointing_use_reentrantbool
seedint
resume_modeOptional[str]None/"none", "latest", or "from_path".
resume_pathOptional[str]
log_pathstrPath for infrastructure log files. For multi-node, use a shared filesystem path to consolidate logs.
ckpt_pathstr
max_ckpts_to_keepint-1 to keep all checkpoints, N to keep only the last N.
ckpt_intervalint
hf_save_intervalintSave HuggingFace-format model every N steps. -1 to disable.
export_pathstrPath for exported artifacts (HF models, debug dumps, etc.).
bf16bool
epochsint
update_epochs_per_batchintNumber of gradient update passes over each training batch.
train_batch_sizeintSee utils/utils.py::validate_batch_sizes for train, mini, and micro batch size constraints.
policy_mini_batch_sizeint
critic_mini_batch_sizeint
micro_train_batch_size_per_gpuint
micro_forward_batch_size_per_gpuint
update_ref_every_epochbool
use_sample_packingbool
eval_batch_sizeint
eval_before_trainbool
eval_intervalint-1 to disable evaluation.
max_prompt_lengthint
flash_attnbool
disable_fast_tokenizerbool
project_namestr
run_namestr
loggerstr
dump_data_batchbool
dump_eval_resultsbool
rope_scalingOptional[Dict[str, Any]]
rope_thetaOptional[float]
Source code in skyrl/train/config/config.py:558-614
@dataclass
class TrainerConfig(BaseConfig):
    """Top-level training configuration: placement and parallelism strategy,
    model roles (policy/ref/critic), algorithm, batch sizes, checkpointing,
    evaluation, and logging.
    """

    placement: PlacementConfig = field(default_factory=PlacementConfig)
    sequence_parallel_backend: str = "ulysses"
    strategy: str = "fsdp2"
    # Model roles. The ref model path defaults to the policy model path (see __post_init__).
    policy: PolicyConfig = field(default_factory=PolicyConfig)
    ref: RefConfig = field(default_factory=RefConfig)
    critic: CriticConfig = field(default_factory=CriticConfig)
    algorithm: AlgorithmConfig = field(default_factory=AlgorithmConfig)
    fully_async: FullyAsyncConfig = field(default_factory=FullyAsyncConfig)
    gradient_checkpointing: bool = True
    gradient_checkpointing_use_reentrant: bool = False
    seed: int = 42
    resume_mode: Optional[str] = "latest"
    """``None``/``"none"``, ``"latest"``, or ``"from_path"``."""
    resume_path: Optional[str] = None
    log_path: str = "/tmp/skyrl-logs"
    """Path for infrastructure log files. For multi-node, use a shared filesystem path to consolidate logs."""
    ckpt_path: str = field(default_factory=lambda: os.path.expanduser("~/ckpts/"))
    max_ckpts_to_keep: int = -1
    """``-1`` to keep all checkpoints, ``N`` to keep only the last N."""
    ckpt_interval: int = 10
    hf_save_interval: int = -1
    """Save HuggingFace-format model every N steps. ``-1`` to disable."""
    export_path: str = field(default_factory=lambda: os.path.expanduser("~/exports/"))
    """Path for exported artifacts (HF models, debug dumps, etc.)."""
    bf16: bool = True
    epochs: int = 1
    update_epochs_per_batch: int = 1
    """Number of gradient update passes over each training batch."""
    train_batch_size: int = 1024
    """See ``utils/utils.py::validate_batch_sizes`` for train, mini, and micro batch size constraints."""
    policy_mini_batch_size: int = 256
    critic_mini_batch_size: int = 256
    micro_train_batch_size_per_gpu: int = 1
    micro_forward_batch_size_per_gpu: int = 1
    update_ref_every_epoch: bool = False
    use_sample_packing: bool = True
    eval_batch_size: int = 1024
    eval_before_train: bool = True
    eval_interval: int = 5
    """``-1`` to disable evaluation."""
    max_prompt_length: int = 512
    flash_attn: bool = True
    disable_fast_tokenizer: bool = False
    project_name: str = "skyrl"
    run_name: str = "test_run"
    logger: str = "wandb"
    dump_data_batch: bool = False
    dump_eval_results: bool = True
    # Optional RoPE overrides; when set they also seed the generator's RoPE
    # params (see SkyRLTrainConfig.__post_init__).
    rope_scaling: Optional[Dict[str, Any]] = None
    rope_theta: Optional[float] = None

    def __post_init__(self):
        """Fill in derived defaults after construction."""
        # ref model defaults to the policy model
        if self.ref.model.path is None:
            self.ref.model.path = self.policy.model.path

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr placement

placement: PlacementConfig = field(default_factory=PlacementConfig)

attr sequence_parallel_backend

sequence_parallel_backend: str = 'ulysses'

attr strategy

strategy: str = 'fsdp2'

attr policy

policy: PolicyConfig = field(default_factory=PolicyConfig)

attr ref

ref: RefConfig = field(default_factory=RefConfig)

attr critic

critic: CriticConfig = field(default_factory=CriticConfig)

attr algorithm

algorithm: AlgorithmConfig = field(default_factory=AlgorithmConfig)

attr fully_async

fully_async: FullyAsyncConfig = field(default_factory=FullyAsyncConfig)

attr gradient_checkpointing

gradient_checkpointing: bool = True

attr gradient_checkpointing_use_reentrant

gradient_checkpointing_use_reentrant: bool = False

attr seed

seed: int = 42

attr resume_mode

resume_mode: Optional[str] = 'latest'

None/"none", "latest", or "from_path".

attr resume_path

resume_path: Optional[str] = None

attr log_path

log_path: str = '/tmp/skyrl-logs'

Path for infrastructure log files. For multi-node, use a shared filesystem path to consolidate logs.

attr ckpt_path

ckpt_path: str = field(default_factory=(lambda: os.path.expanduser('~/ckpts/')))

attr max_ckpts_to_keep

max_ckpts_to_keep: int = -1

-1 to keep all checkpoints, N to keep only the last N.

attr ckpt_interval

ckpt_interval: int = 10

attr hf_save_interval

hf_save_interval: int = -1

Save HuggingFace-format model every N steps. -1 to disable.

attr export_path

export_path: str = field(default_factory=(lambda: os.path.expanduser('~/exports/')))

Path for exported artifacts (HF models, debug dumps, etc.).

attr bf16

bf16: bool = True

attr epochs

epochs: int = 1

attr update_epochs_per_batch

update_epochs_per_batch: int = 1

Number of gradient update passes over each training batch.

attr train_batch_size

train_batch_size: int = 1024

See utils/utils.py::validate_batch_sizes for train, mini, and micro batch size constraints.

attr policy_mini_batch_size

policy_mini_batch_size: int = 256

attr critic_mini_batch_size

critic_mini_batch_size: int = 256

attr micro_train_batch_size_per_gpu

micro_train_batch_size_per_gpu: int = 1

attr micro_forward_batch_size_per_gpu

micro_forward_batch_size_per_gpu: int = 1

attr update_ref_every_epoch

update_ref_every_epoch: bool = False

attr use_sample_packing

use_sample_packing: bool = True

attr eval_batch_size

eval_batch_size: int = 1024

attr eval_before_train

eval_before_train: bool = True

attr eval_interval

eval_interval: int = 5

-1 to disable evaluation.

attr max_prompt_length

max_prompt_length: int = 512

attr flash_attn

flash_attn: bool = True

attr disable_fast_tokenizer

disable_fast_tokenizer: bool = False

attr project_name

project_name: str = 'skyrl'

attr run_name

run_name: str = 'test_run'

attr logger

logger: str = 'wandb'

attr dump_data_batch

dump_data_batch: bool = False

attr dump_eval_results

dump_eval_results: bool = True

attr rope_scaling

rope_scaling: Optional[Dict[str, Any]] = None

attr rope_theta

rope_theta: Optional[float] = None

class OptimizerConfig

OptimizerConfig(lr: float = 1e-06, adam_betas: List[float] = (lambda: [0.9, 0.999])(), weight_decay: float = 0.01, max_grad_norm: float = 1.0, offload_after_step: bool = True, num_warmup_steps: int = 0, scheduler: str = 'constant_with_warmup') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
lrfloat
adam_betasList[float]
weight_decayfloat
max_grad_normfloat
offload_after_stepboolOffload optimizer state to CPU after each full training step. Only applicable when colocate_all=True.
num_warmup_stepsintNumber of mini-batch steps to warmup the optimizer.
schedulerstr
Source code in skyrl/train/config/config.py:76-86
@dataclass
class OptimizerConfig(BaseConfig):
    """Optimizer and learning-rate schedule settings."""

    lr: float = 1e-6
    adam_betas: List[float] = field(default_factory=lambda: [0.9, 0.999])
    weight_decay: float = 1e-2
    # Gradient-norm clipping threshold.
    max_grad_norm: float = 1.0
    offload_after_step: bool = True
    """Offload optimizer state to CPU after each full training step. Only applicable when ``colocate_all=True``."""
    num_warmup_steps: int = 0
    """Number of mini-batch steps to warmup the optimizer."""
    scheduler: str = "constant_with_warmup"

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr lr

lr: float = 1e-06

attr adam_betas

adam_betas: List[float] = field(default_factory=(lambda: [0.9, 0.999]))

attr weight_decay

weight_decay: float = 0.01

attr max_grad_norm

max_grad_norm: float = 1.0

attr offload_after_step

offload_after_step: bool = True

Offload optimizer state to CPU after each full training step. Only applicable when colocate_all=True.

attr num_warmup_steps

num_warmup_steps: int = 0

Number of mini-batch steps to warmup the optimizer.

attr scheduler

scheduler: str = 'constant_with_warmup'

class MixedPrecisionConfig

MixedPrecisionConfig(param_dtype: str = 'bf16', reduce_dtype: str = 'fp32', buffer_dtype: str = 'fp32') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
param_dtypestr
reduce_dtypestr
buffer_dtypestr
Source code in skyrl/train/config/config.py:89-93
@dataclass
class MixedPrecisionConfig(BaseConfig):
    """Mixed-precision dtypes (params, reductions, buffers) for the FSDP backend."""

    param_dtype: str = "bf16"
    reduce_dtype: str = "fp32"
    buffer_dtype: str = "fp32"

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr param_dtype

param_dtype: str = 'bf16'

attr reduce_dtype

reduce_dtype: str = 'fp32'

attr buffer_dtype

buffer_dtype: str = 'fp32'

Backend Config

class FSDPConfig

FSDPConfig(cpu_offload: bool = False, reshard_after_forward: Union[bool, int] = True, fsdp_size: int = -1, mixed_precision: Optional[MixedPrecisionConfig] = None, wrap_policy: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
cpu_offloadboolOffload params and optimizer state to CPU during the forward pass.
reshard_after_forwardUnion[bool, int]FSDP2 only. Accepts True, False, or an int between 1 and fsdp_size.
fsdp_sizeint
mixed_precisionOptional[MixedPrecisionConfig]
wrap_policydict
Source code in skyrl/train/config/config.py:96-105
@dataclass
class FSDPConfig(BaseConfig):
    """FSDP/FSDP2 sharding configuration."""

    cpu_offload: bool = False
    """Offload params and optimizer state to CPU during the forward pass."""
    reshard_after_forward: Union[bool, int] = True
    """FSDP2 only. Accepts True, False, or an int between 1 and ``fsdp_size``."""
    # -1 presumably means "shard across all ranks" — TODO confirm.
    fsdp_size: int = -1
    # None falls back to the backend's default precision handling.
    mixed_precision: Optional[MixedPrecisionConfig] = None
    # specify wrap policy as a dict with `transformer_layer_cls_to_wrap` key for custom module based wrapping
    wrap_policy: dict = field(default_factory=dict)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr cpu_offload

cpu_offload: bool = False

Offload params and optimizer state to CPU during the forward pass.

attr reshard_after_forward

reshard_after_forward: Union[bool, int] = True

FSDP2 only. Accepts True, False, or an int between 1 and fsdp_size.

attr fsdp_size

fsdp_size: int = -1

attr mixed_precision

mixed_precision: Optional[MixedPrecisionConfig] = None

attr wrap_policy

wrap_policy: dict = field(default_factory=dict)

class MegatronConfig

MegatronConfig(tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, context_parallel_size: int = 1, expert_model_parallel_size: int = 1, expert_tensor_parallel_size: Optional[int] = None, moe_token_dispatcher_type: str = 'alltoall', moe_router_load_balancing_type: str = 'none', moe_grouped_gemm: bool = True, moe_router_score_function: Optional[str] = None, moe_router_enable_expert_bias: Optional[bool] = None, ddp_config: MegatronDDPConfig = MegatronDDPConfig(), torch_profiler_config: MegatronTorchProfilerConfig = MegatronTorchProfilerConfig(), lora_config: MegatronLoraConfig = MegatronLoraConfig(), optimizer_config_kwargs: Dict[str, Any] = (lambda: copy.deepcopy(DEFAULT_MEGATRON_OPTIMIZER_KWARGS))(), transformer_config_kwargs: Dict[str, Any] = (lambda: copy.deepcopy(DEFAULT_TRANSFORMER_CONFIG_KWARGS))(), empty_cuda_cache: Optional[bool] = None, model_config_kwargs: dict = dict(), dist_ckpt_optim_fully_reshardable: bool = False) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:148-172
@dataclass
class MegatronConfig(BaseConfig):
    """Megatron backend parallelism and runtime configuration."""

    tensor_model_parallel_size: int = 1
    pipeline_model_parallel_size: int = 1
    context_parallel_size: int = 1
    expert_model_parallel_size: int = 1
    # None presumably means "derive from the other parallelism sizes" — TODO confirm.
    expert_tensor_parallel_size: Optional[int] = None
    # MoE runtime configuration flags
    moe_token_dispatcher_type: str = "alltoall"
    moe_router_load_balancing_type: str = "none"
    moe_grouped_gemm: bool = True
    moe_router_score_function: Optional[str] = None
    moe_router_enable_expert_bias: Optional[bool] = None
    ddp_config: MegatronDDPConfig = field(default_factory=MegatronDDPConfig)
    torch_profiler_config: MegatronTorchProfilerConfig = field(default_factory=MegatronTorchProfilerConfig)
    lora_config: MegatronLoraConfig = field(default_factory=MegatronLoraConfig)
    # deepcopy in the factory so each instance gets its own dict and per-instance
    # mutations cannot leak into the shared module-level defaults.
    optimizer_config_kwargs: Dict[str, Any] = field(
        default_factory=lambda: copy.deepcopy(DEFAULT_MEGATRON_OPTIMIZER_KWARGS)
    )
    transformer_config_kwargs: Dict[str, Any] = field(
        default_factory=lambda: copy.deepcopy(DEFAULT_TRANSFORMER_CONFIG_KWARGS)
    )
    empty_cuda_cache: Optional[bool] = None
    model_config_kwargs: dict = field(default_factory=dict)
    dist_ckpt_optim_fully_reshardable: bool = False

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr tensor_model_parallel_size

tensor_model_parallel_size: int = 1

attr pipeline_model_parallel_size

pipeline_model_parallel_size: int = 1

attr context_parallel_size

context_parallel_size: int = 1

attr expert_model_parallel_size

expert_model_parallel_size: int = 1

attr expert_tensor_parallel_size

expert_tensor_parallel_size: Optional[int] = None

attr moe_token_dispatcher_type

moe_token_dispatcher_type: str = 'alltoall'

attr moe_router_load_balancing_type

moe_router_load_balancing_type: str = 'none'

attr moe_grouped_gemm

moe_grouped_gemm: bool = True

attr moe_router_score_function

moe_router_score_function: Optional[str] = None

attr moe_router_enable_expert_bias

moe_router_enable_expert_bias: Optional[bool] = None

attr ddp_config

ddp_config: MegatronDDPConfig = field(default_factory=MegatronDDPConfig)

attr torch_profiler_config

torch_profiler_config: MegatronTorchProfilerConfig = field(default_factory=MegatronTorchProfilerConfig)

attr lora_config

lora_config: MegatronLoraConfig = field(default_factory=MegatronLoraConfig)

attr optimizer_config_kwargs

optimizer_config_kwargs: Dict[str, Any] = field(default_factory=(lambda: copy.deepcopy(DEFAULT_MEGATRON_OPTIMIZER_KWARGS)))

attr transformer_config_kwargs

transformer_config_kwargs: Dict[str, Any] = field(default_factory=(lambda: copy.deepcopy(DEFAULT_TRANSFORMER_CONFIG_KWARGS)))

attr empty_cuda_cache

empty_cuda_cache: Optional[bool] = None

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

attr dist_ckpt_optim_fully_reshardable

dist_ckpt_optim_fully_reshardable: bool = False

class MegatronDDPConfig

MegatronDDPConfig(grad_reduce_in_fp32: bool = True, overlap_grad_reduce: bool = False, overlap_param_gather: bool = False, average_in_collective: bool = True) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:113-118
@dataclass
class MegatronDDPConfig(BaseConfig):
    grad_reduce_in_fp32: bool = True
    overlap_grad_reduce: bool = False
    overlap_param_gather: bool = False
    average_in_collective: bool = True

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr grad_reduce_in_fp32

grad_reduce_in_fp32: bool = True

attr overlap_grad_reduce

overlap_grad_reduce: bool = False

attr overlap_param_gather

overlap_param_gather: bool = False

attr average_in_collective

average_in_collective: bool = True

class MegatronLoraConfig

MegatronLoraConfig(lora_type: str = 'lora') -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
lora_typestr
Source code in skyrl/train/config/config.py:128-130
@dataclass
class MegatronLoraConfig(BaseConfig):
    lora_type: str = "lora"

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr lora_type

lora_type: str = 'lora'

class MegatronTorchProfilerConfig

MegatronTorchProfilerConfig(enable: bool = False, ranks: List[int] = list(), save_path: Optional[str] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
enablebool
ranksList[int]
save_pathOptional[str]
Source code in skyrl/train/config/config.py:121-125
@dataclass
class MegatronTorchProfilerConfig(BaseConfig):
    enable: bool = False
    ranks: List[int] = field(default_factory=list)
    save_path: Optional[str] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr enable

enable: bool = False

attr ranks

ranks: List[int] = field(default_factory=list)

attr save_path

save_path: Optional[str] = None

Placement

class PlacementConfig

PlacementConfig(colocate_all: bool = True, colocate_policy_ref: bool = True, policy_num_nodes: int = 1, policy_num_gpus_per_node: int = 1, critic_num_nodes: int = 1, critic_num_gpus_per_node: int = 1, ref_num_nodes: int = 1, ref_num_gpus_per_node: int = 1) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
colocate_allboolWhen True, training and inference share the same GPUs.
colocate_policy_refbool
policy_num_nodesint
policy_num_gpus_per_nodeint
critic_num_nodesint
critic_num_gpus_per_nodeint
ref_num_nodesint
ref_num_gpus_per_nodeint
Source code in skyrl/train/config/config.py:180-190
@dataclass
class PlacementConfig(BaseConfig):
    colocate_all: bool = True
    """When True, training and inference share the same GPUs."""
    colocate_policy_ref: bool = True
    policy_num_nodes: int = 1
    policy_num_gpus_per_node: int = 1
    critic_num_nodes: int = 1
    critic_num_gpus_per_node: int = 1
    ref_num_nodes: int = 1
    ref_num_gpus_per_node: int = 1

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr colocate_all

colocate_all: bool = True

When True, training and inference share the same GPUs.

attr colocate_policy_ref

colocate_policy_ref: bool = True

attr policy_num_nodes

policy_num_nodes: int = 1

attr policy_num_gpus_per_node

policy_num_gpus_per_node: int = 1

attr critic_num_nodes

critic_num_nodes: int = 1

attr critic_num_gpus_per_node

critic_num_gpus_per_node: int = 1

attr ref_num_nodes

ref_num_nodes: int = 1

attr ref_num_gpus_per_node

ref_num_gpus_per_node: int = 1

Policy & Algorithm

class PolicyConfig

PolicyConfig(model: ModelConfig = (lambda: copy.deepcopy(ModelConfig(path='Qwen/Qwen2.5-1.5B-Instruct')))(), optimizer_config: OptimizerConfig = OptimizerConfig(), fsdp_config: FSDPConfig = FSDPConfig(), sequence_parallel_size: int = 1, use_torch_compile: bool = False, record_memory: bool = False, megatron_config: MegatronConfig = MegatronConfig(), model_config_kwargs: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
modelModelConfig
optimizer_configOptimizerConfig
fsdp_configFSDPConfig
sequence_parallel_sizeint
use_torch_compileboolApply torch.compile to logits calculation.
record_memoryboolSave memory snapshots to {ckpt_path}/memory_snapshots/.
megatron_configMegatronConfig
model_config_kwargsdictPass-through kwargs for the HuggingFace model config (FSDP backends).
Source code in skyrl/train/config/config.py:198-212
@dataclass
class PolicyConfig(BaseConfig):
    model: ModelConfig = field(default_factory=lambda: copy.deepcopy(ModelConfig(path="Qwen/Qwen2.5-1.5B-Instruct")))
    optimizer_config: OptimizerConfig = field(default_factory=OptimizerConfig)
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    sequence_parallel_size: int = 1
    use_torch_compile: bool = False
    """Apply torch.compile to logits calculation."""
    record_memory: bool = False
    """Save memory snapshots to ``{ckpt_path}/memory_snapshots/``.
    Visualize by dragging pickle files to https://docs.pytorch.org/memory_viz."""
    megatron_config: MegatronConfig = field(default_factory=MegatronConfig)
    model_config_kwargs: dict = field(default_factory=dict)
    """Pass-through kwargs for the HuggingFace model config (FSDP backends).
    For Megatron, use ``policy.megatron_config.transformer_config_kwargs`` instead."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: ModelConfig = field(default_factory=(lambda: copy.deepcopy(ModelConfig(path='Qwen/Qwen2.5-1.5B-Instruct'))))

attr optimizer_config

optimizer_config: OptimizerConfig = field(default_factory=OptimizerConfig)

attr fsdp_config

fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)

attr sequence_parallel_size

sequence_parallel_size: int = 1

attr use_torch_compile

use_torch_compile: bool = False

Apply torch.compile to logits calculation.

attr record_memory

record_memory: bool = False

Save memory snapshots to {ckpt_path}/memory_snapshots/. Visualize by dragging pickle files to https://docs.pytorch.org/memory_viz.

attr megatron_config

megatron_config: MegatronConfig = field(default_factory=MegatronConfig)

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

Pass-through kwargs for the HuggingFace model config (FSDP backends). For Megatron, use policy.megatron_config.transformer_config_kwargs instead.

class CriticConfig

CriticConfig(model: ModelConfig = ModelConfig(), optimizer_config: OptimizerConfig = (lambda: OptimizerConfig(lr=5e-06))(), fsdp_config: FSDPConfig = FSDPConfig(), sequence_parallel_size: int = 1, model_config_kwargs: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:215-221
@dataclass
class CriticConfig(BaseConfig):
    model: ModelConfig = field(default_factory=ModelConfig)
    optimizer_config: OptimizerConfig = field(default_factory=lambda: OptimizerConfig(lr=5e-6))
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    sequence_parallel_size: int = 1
    model_config_kwargs: dict = field(default_factory=dict)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: ModelConfig = field(default_factory=ModelConfig)

attr optimizer_config

optimizer_config: OptimizerConfig = field(default_factory=(lambda: OptimizerConfig(lr=5e-06)))

attr fsdp_config

fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)

attr sequence_parallel_size

sequence_parallel_size: int = 1

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

class RefConfig

RefConfig(model: ModelConfig = ModelConfig(), sequence_parallel_size: int = 1, fsdp_config: FSDPConfig = FSDPConfig(), megatron_config: MegatronConfig = MegatronConfig(), model_config_kwargs: dict = dict()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

Source code in skyrl/train/config/config.py:225-231
@dataclass
class RefConfig(BaseConfig):
    model: ModelConfig = field(default_factory=ModelConfig)
    sequence_parallel_size: int = 1
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    megatron_config: MegatronConfig = field(default_factory=MegatronConfig)
    model_config_kwargs: dict = field(default_factory=dict)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: ModelConfig = field(default_factory=ModelConfig)

attr sequence_parallel_size

sequence_parallel_size: int = 1

attr fsdp_config

fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)

attr megatron_config

megatron_config: MegatronConfig = field(default_factory=MegatronConfig)

attr model_config_kwargs

model_config_kwargs: dict = field(default_factory=dict)

class AlgorithmConfig

AlgorithmConfig(advantage_estimator: str = 'grpo', kl_ctrl: KLCtrlConfig = KLCtrlConfig(), kl_estimator_type: str = 'k3', use_kl_in_reward: bool = False, use_kl_loss: bool = True, kl_loss_coef: float = 0.001, use_entropy_loss: bool = False, entropy_loss_coef: float = 0.01, temperature: Optional[float] = None, advantage_batch_normalize: bool = False, value_head_prefix: str = 'value_head', policy_loss_type: str = 'regular', loss_reduction: str = 'token_mean', grpo_norm_by_std: bool = True, zero_variance_filter: bool = False, lambd: float = 1.0, gamma: float = 1.0, eps_clip_low: float = 0.2, eps_clip_high: float = 0.2, clip_ratio_c: float = 3.0, tis_imp_ratio_cap: float = -1.0, use_tis: bool = False, off_policy_correction: OffPolicyCorrectionConfig = OffPolicyCorrectionConfig(), sapo: SAPOConfig = SAPOConfig(), value_clip: float = 0.2, dynamic_sampling: DynamicSamplingConfig = DynamicSamplingConfig(), clip_cov: ClipCovConfig = ClipCovConfig(), kl_cov: KLCovConfig = KLCovConfig(), cispo: CISPOConfig = CISPOConfig(), max_seq_len: Optional[int] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
advantage_estimatorstr"grpo", "gae", "rloo", "reinforce++", or custom via AdvantageEstimatorRegistry.
kl_ctrlKLCtrlConfigOnly used when use_kl_in_reward=True (not applied when use_kl_loss=True).
kl_estimator_typestr"k1", "k2", "k3", "abs". See http://joschu.net/blog/kl-approx.html.
use_kl_in_rewardboolApply KL penalty to rewards. Mutually exclusive with use_kl_loss.
use_kl_lossboolApply KL loss in the policy model. Mutually exclusive with use_kl_in_reward.
kl_loss_coeffloat
use_entropy_lossbool
entropy_loss_coeffloat
temperatureOptional[float]Temperature for scaling logits in policy loss computation.
advantage_batch_normalizebool
value_head_prefixstr
policy_loss_typestr"regular", "dual_clip", "gspo", "clip_cov", "kl_cov", or custom via PolicyLossRegistry.
loss_reductionstr"token_mean", "sequence_mean", or "seq_mean_token_sum_norm".
grpo_norm_by_stdbool
zero_variance_filterboolLoss-mask prompts with zero-variance rewards. Only applicable when rewards are response-level.
lambdfloat
gammafloat
eps_clip_lowfloat
eps_clip_highfloat
clip_ratio_cfloatDual-clip parameter.
tis_imp_ratio_capfloatDeprecated: use off_policy_correction.tis_ratio_type="token" and token_tis_ratio_clip_high instead.
use_tisboolDeprecated: use off_policy_correction instead.
off_policy_correctionOffPolicyCorrectionConfig
sapoSAPOConfig
value_clipfloat
dynamic_samplingDynamicSamplingConfig
clip_covClipCovConfigOnly used when policy_loss_type="clip_cov".
kl_covKLCovConfigOnly used when policy_loss_type="kl_cov".
cispoCISPOConfigOnly used when policy_loss_type="cispo".
max_seq_lenOptional[int]Used for seq_mean_token_sum_norm loss reduction; set explicitly for multi-turn.
Source code in skyrl/train/config/config.py:322-375
@dataclass
class AlgorithmConfig(BaseConfig):
    advantage_estimator: str = "grpo"
    """``"grpo"``, ``"gae"``, ``"rloo"``, ``"reinforce++"``, or custom via ``AdvantageEstimatorRegistry``."""
    kl_ctrl: KLCtrlConfig = field(default_factory=KLCtrlConfig)
    """Only used when ``use_kl_in_reward=True`` (not applied when ``use_kl_loss=True``).
    Uses ``kl_loss_coef`` as the initial KL coefficient."""
    kl_estimator_type: str = "k3"
    """``"k1"``, ``"k2"``, ``"k3"``, ``"abs"``. See http://joschu.net/blog/kl-approx.html."""
    use_kl_in_reward: bool = False
    """Apply KL penalty to rewards. Mutually exclusive with ``use_kl_loss``."""
    use_kl_loss: bool = True
    """Apply KL loss in the policy model. Mutually exclusive with ``use_kl_in_reward``."""
    kl_loss_coef: float = 0.001
    use_entropy_loss: bool = False
    entropy_loss_coef: float = 0.01
    temperature: Optional[float] = None
    """Temperature for scaling logits in policy loss computation.
    If ``None``, will be set to the temperature provided by ``generator.sampling_params.temperature`` during config validation.
    
    NOTE: When using HTTP endpoints directly, make sure to set this value to the temperature used during generation
    """
    advantage_batch_normalize: bool = False
    value_head_prefix: str = "value_head"
    policy_loss_type: str = "regular"
    """``"regular"``, ``"dual_clip"``, ``"gspo"``, ``"clip_cov"``, ``"kl_cov"``, or custom via ``PolicyLossRegistry``."""
    loss_reduction: str = "token_mean"
    """``"token_mean"``, ``"sequence_mean"``, or ``"seq_mean_token_sum_norm"``."""
    grpo_norm_by_std: bool = True
    zero_variance_filter: bool = False
    """Loss-mask prompts with zero-variance rewards. Only applicable when rewards are response-level."""
    lambd: float = 1.0
    gamma: float = 1.0
    eps_clip_low: float = 0.2
    eps_clip_high: float = 0.2
    clip_ratio_c: float = 3.0
    """Dual-clip parameter."""
    tis_imp_ratio_cap: float = -1.0
    """Deprecated: use ``off_policy_correction.tis_ratio_type="token"`` and ``token_tis_ratio_clip_high`` instead."""
    use_tis: bool = False
    """Deprecated: use ``off_policy_correction`` instead."""
    off_policy_correction: OffPolicyCorrectionConfig = field(default_factory=OffPolicyCorrectionConfig)
    sapo: SAPOConfig = field(default_factory=SAPOConfig)
    value_clip: float = 0.2
    dynamic_sampling: DynamicSamplingConfig = field(default_factory=DynamicSamplingConfig)
    clip_cov: ClipCovConfig = field(default_factory=ClipCovConfig)
    """Only used when ``policy_loss_type="clip_cov"``."""
    kl_cov: KLCovConfig = field(default_factory=KLCovConfig)
    """Only used when ``policy_loss_type="kl_cov"``."""
    cispo: CISPOConfig = field(default_factory=CISPOConfig)
    """Only used when ``policy_loss_type="cispo"``."""
    max_seq_len: Optional[int] = None
    """Used for ``seq_mean_token_sum_norm`` loss reduction; set explicitly for multi-turn.
    If ``None``, calculated as ``generator.max_input_length + generator.sampling_params.max_generate_length``."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr advantage_estimator

advantage_estimator: str = 'grpo'

"grpo", "gae", "rloo", "reinforce++", or custom via AdvantageEstimatorRegistry.

attr kl_ctrl

kl_ctrl: KLCtrlConfig = field(default_factory=KLCtrlConfig)

Only used when use_kl_in_reward=True (not applied when use_kl_loss=True). Uses kl_loss_coef as the initial KL coefficient.

attr kl_estimator_type

kl_estimator_type: str = 'k3'

"k1", "k2", "k3", "abs". See http://joschu.net/blog/kl-approx.html.

attr use_kl_in_reward

use_kl_in_reward: bool = False

Apply KL penalty to rewards. Mutually exclusive with use_kl_loss.

attr use_kl_loss

use_kl_loss: bool = True

Apply KL loss in the policy model. Mutually exclusive with use_kl_in_reward.

attr kl_loss_coef

kl_loss_coef: float = 0.001

attr use_entropy_loss

use_entropy_loss: bool = False

attr entropy_loss_coef

entropy_loss_coef: float = 0.01

attr temperature

temperature: Optional[float] = None

Temperature for scaling logits in policy loss computation. If None, will be set to the temperature provided by generator.sampling_params.temperature during config validation.

NOTE: When using HTTP endpoints directly, make sure to set this value to the temperature used during generation

attr advantage_batch_normalize

advantage_batch_normalize: bool = False

attr value_head_prefix

value_head_prefix: str = 'value_head'

attr policy_loss_type

policy_loss_type: str = 'regular'

"regular", "dual_clip", "gspo", "clip_cov", "kl_cov", or custom via PolicyLossRegistry.

attr loss_reduction

loss_reduction: str = 'token_mean'

"token_mean", "sequence_mean", or "seq_mean_token_sum_norm".

attr grpo_norm_by_std

grpo_norm_by_std: bool = True

attr zero_variance_filter

zero_variance_filter: bool = False

Loss-mask prompts with zero-variance rewards. Only applicable when rewards are response-level.

attr lambd

lambd: float = 1.0

attr gamma

gamma: float = 1.0

attr eps_clip_low

eps_clip_low: float = 0.2

attr eps_clip_high

eps_clip_high: float = 0.2

attr clip_ratio_c

clip_ratio_c: float = 3.0

Dual-clip parameter.

attr tis_imp_ratio_cap

tis_imp_ratio_cap: float = -1.0

Deprecated: use off_policy_correction.tis_ratio_type="token" and token_tis_ratio_clip_high instead.

attr use_tis

use_tis: bool = False

Deprecated: use off_policy_correction instead.

attr off_policy_correction

off_policy_correction: OffPolicyCorrectionConfig = field(default_factory=OffPolicyCorrectionConfig)

attr sapo

sapo: SAPOConfig = field(default_factory=SAPOConfig)

attr value_clip

value_clip: float = 0.2

attr dynamic_sampling

dynamic_sampling: DynamicSamplingConfig = field(default_factory=DynamicSamplingConfig)

attr clip_cov

clip_cov: ClipCovConfig = field(default_factory=ClipCovConfig)

Only used when policy_loss_type="clip_cov".

attr kl_cov

kl_cov: KLCovConfig = field(default_factory=KLCovConfig)

Only used when policy_loss_type="kl_cov".

attr cispo

cispo: CISPOConfig = field(default_factory=CISPOConfig)

Only used when policy_loss_type="cispo".

attr max_seq_len

max_seq_len: Optional[int] = None

Used for seq_mean_token_sum_norm loss reduction; set explicitly for multi-turn. If None, calculated as generator.max_input_length + generator.sampling_params.max_generate_length.

class KLCtrlConfig

KLCtrlConfig(type: str = 'fixed', kl_target: float = 0.1, horizon: int = 10000) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
typestr"fixed" or "adaptive".
kl_targetfloatTarget KL divergence for the adaptive KL controller.
horizonintControls the update rate of the adaptive KL controller.
Source code in skyrl/train/config/config.py:239-247
@dataclass
class KLCtrlConfig(BaseConfig):

    type: str = "fixed"
    """``"fixed"`` or ``"adaptive"``."""
    kl_target: float = 0.1
    """Target KL divergence for the adaptive KL controller."""
    horizon: int = 10000
    """Controls the update rate of the adaptive KL controller."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr type

type: str = 'fixed'

"fixed" or "adaptive".

attr kl_target

kl_target: float = 0.1

Target KL divergence for the adaptive KL controller.

attr horizon

horizon: int = 10000

Controls the update rate of the adaptive KL controller.

Algorithm Extensions

class SAPOConfig

SAPOConfig(tau_pos: float = 1.0, tau_neg: float = 1.05) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
tau_posfloat
tau_negfloat
Source code in skyrl/train/config/config.py:250-253
@dataclass
class SAPOConfig(BaseConfig):
    tau_pos: float = 1.0
    tau_neg: float = 1.05

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr tau_pos

tau_pos: float = 1.0

attr tau_neg

tau_neg: float = 1.05

class DynamicSamplingConfig

DynamicSamplingConfig(type: Optional[str] = None, max_sample_batches: int = 30, min_replace_ratio: float = 0.3) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
typeOptional[str]"filter", "replace", or None.
max_sample_batchesintSample at most this many batches before stopping. -1 to sample forever.
min_replace_ratiofloatMinimum proportion of good samples to replace bad samples. Only used with "replace" strategy.
Source code in skyrl/train/config/config.py:256-263
@dataclass
class DynamicSamplingConfig(BaseConfig):
    type: Optional[str] = None
    """``"filter"``, ``"replace"``, or ``None``."""
    max_sample_batches: int = 30
    """Sample at most this many batches before stopping. ``-1`` to sample forever."""
    min_replace_ratio: float = 0.3
    """Minimum proportion of good samples to replace bad samples. Only used with ``"replace"`` strategy."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr type

type: Optional[str] = None

"filter", "replace", or None.

attr max_sample_batches

max_sample_batches: int = 30

Sample at most this many batches before stopping. -1 to sample forever.

attr min_replace_ratio

min_replace_ratio: float = 0.3

Minimum proportion of good samples to replace bad samples. Only used with "replace" strategy.

class ClipCovConfig

ClipCovConfig(clip_ratio: float = 0.0002, clip_cov_lb: float = 1.0, clip_cov_ub: float = 5.0) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
clip_ratiofloatFraction of tokens to clip based on covariance.
clip_cov_lbfloat
clip_cov_ubfloat
Source code in skyrl/train/config/config.py:266-272
@dataclass
class ClipCovConfig(BaseConfig):

    clip_ratio: float = 0.0002
    """Fraction of tokens to clip based on covariance."""
    clip_cov_lb: float = 1.0
    clip_cov_ub: float = 5.0

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr clip_ratio

clip_ratio: float = 0.0002

Fraction of tokens to clip based on covariance.

attr clip_cov_lb

clip_cov_lb: float = 1.0

attr clip_cov_ub

clip_cov_ub: float = 5.0

class KLCovConfig

KLCovConfig(kl_cov_frac: float = 0.2, ppo_kl_coef: float = 1.0) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
kl_cov_fracfloatFraction of tokens to apply KL regularization to.
ppo_kl_coeffloat
Source code in skyrl/train/config/config.py:275-280
@dataclass
class KLCovConfig(BaseConfig):

    kl_cov_frac: float = 0.2
    """Fraction of tokens to apply KL regularization to."""
    ppo_kl_coef: float = 1.0

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr kl_cov_frac

kl_cov_frac: float = 0.2

Fraction of tokens to apply KL regularization to.

attr ppo_kl_coef

ppo_kl_coef: float = 1.0

class CISPOConfig

CISPOConfig(cispo_eps_clip_low: float = 0.0, cispo_eps_clip_high: float = 5.0) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
cispo_eps_clip_lowfloatOffset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping).
cispo_eps_clip_highfloatOffset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping).
Source code in skyrl/train/config/config.py:283-289
@dataclass
class CISPOConfig(BaseConfig):

    cispo_eps_clip_low: float = 0.0
    """Offset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping)."""
    cispo_eps_clip_high: float = 5.0
    """Offset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping)."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr cispo_eps_clip_low

cispo_eps_clip_low: float = 0.0

Offset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping).

attr cispo_eps_clip_high

cispo_eps_clip_high: float = 5.0

Offset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping).

class OffPolicyCorrectionConfig

OffPolicyCorrectionConfig(tis_ratio_type: Optional[str] = None, token_tis_ratio_clip_high: float = 2.0, sequence_tis_ratio_clip_high: float = 5.0, sequence_mask_metric: Optional[str] = None, geo_mask_high: float = 1.01, geo_mask_low: float = 0.99, product_mask_high: float = 2.0, product_mask_low: float = 0.5, outlier_token_is_threshold_low: Optional[float] = None, outlier_token_is_threshold_high: Optional[float] = None, token_mask_is_threshold_low: Optional[float] = None, token_mask_is_threshold_high: Optional[float] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
tis_ratio_typeOptional[str]Importance sampling ratio type for PPO loss correction: None, "token", or "sequence".
token_tis_ratio_clip_highfloatUsed when tis_ratio_type="token". Recommended range: 1.5--5.0.
sequence_tis_ratio_clip_highfloatUsed when tis_ratio_type="sequence". Recommended range: 2.0--10.0.
sequence_mask_metricOptional[str]Method for masking sequences with cumulative IS ratios outside cap: None, "product", or "geometric".
geo_mask_highfloatUsed when sequence_mask_metric="geometric". Recommended ~0.99--1.01; MoE models may need a wider range.
geo_mask_lowfloatUsed when sequence_mask_metric="geometric".
product_mask_highfloatUsed when sequence_mask_metric="product". Recommended ~0.5--2.0.
product_mask_lowfloatUsed when sequence_mask_metric="product".
outlier_token_is_threshold_lowOptional[float]Set to mask sequences with any token IS ratio below this threshold. Suggested: 1e-4. None to disable.
outlier_token_is_threshold_highOptional[float]Set to mask sequences with any token IS ratio above this threshold. Suggested: 100. None to disable.
token_mask_is_threshold_lowOptional[float]Set to mask per-token when IS ratio < token_mask_is_threshold_low. None to disable.
token_mask_is_threshold_highOptional[float]Set to mask per-token when IS ratio > token_mask_is_threshold_high. None to disable.
Source code in skyrl/train/config/config.py:293-319
@dataclass
class OffPolicyCorrectionConfig(BaseConfig):
    tis_ratio_type: Optional[str] = None
    """Importance sampling ratio type for PPO loss correction: ``None``, ``"token"``, or ``"sequence"``.
    The ratio is ``exp(logprobs_policy_old - logprobs_rollout_policy)``."""
    token_tis_ratio_clip_high: float = 2.0
    """Used when ``tis_ratio_type="token"``. Recommended range: 1.5--5.0."""
    sequence_tis_ratio_clip_high: float = 5.0
    """Used when ``tis_ratio_type="sequence"``. Recommended range: 2.0--10.0."""
    sequence_mask_metric: Optional[str] = None
    """Method for masking sequences with cumulative IS ratios outside cap: ``None``, ``"product"``, or ``"geometric"``."""
    geo_mask_high: float = 1.01
    """Used when ``sequence_mask_metric="geometric"``. Recommended ~0.99--1.01; MoE models may need a wider range."""
    geo_mask_low: float = 0.99
    """Used when ``sequence_mask_metric="geometric"``."""
    product_mask_high: float = 2.0
    """Used when ``sequence_mask_metric="product"``. Recommended ~0.5--2.0."""
    product_mask_low: float = 0.5
    """Used when ``sequence_mask_metric="product"``."""
    outlier_token_is_threshold_low: Optional[float] = None
    """Set to mask sequences with any token IS ratio below this threshold. Suggested: 1e-4. ``None`` to disable."""
    outlier_token_is_threshold_high: Optional[float] = None
    """Set to mask sequences with any token IS ratio above this threshold. Suggested: 100. ``None`` to disable."""
    token_mask_is_threshold_low: Optional[float] = None
    """Set to mask per-token when IS ratio < `token_mask_is_threshold_low`. ``None`` to disable."""
    token_mask_is_threshold_high: Optional[float] = None
    """Set to mask per-token when IS ratio > `token_mask_is_threshold_high`. ``None`` to disable."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr tis_ratio_type

tis_ratio_type: Optional[str] = None

Importance sampling ratio type for PPO loss correction: None, "token", or "sequence". The ratio is exp(logprobs_policy_old - logprobs_rollout_policy).

attr token_tis_ratio_clip_high

token_tis_ratio_clip_high: float = 2.0

Used when tis_ratio_type="token". Recommended range: 1.5--5.0.

attr sequence_tis_ratio_clip_high

sequence_tis_ratio_clip_high: float = 5.0

Used when tis_ratio_type="sequence". Recommended range: 2.0--10.0.

attr sequence_mask_metric

sequence_mask_metric: Optional[str] = None

Method for masking sequences with cumulative IS ratios outside cap: None, "product", or "geometric".

attr geo_mask_high

geo_mask_high: float = 1.01

Used when sequence_mask_metric="geometric". Recommended ~0.99--1.01; MoE models may need a wider range.

attr geo_mask_low

geo_mask_low: float = 0.99

Used when sequence_mask_metric="geometric".

attr product_mask_high

product_mask_high: float = 2.0

Used when sequence_mask_metric="product". Recommended ~0.5--2.0.

attr product_mask_low

product_mask_low: float = 0.5

Used when sequence_mask_metric="product".

attr outlier_token_is_threshold_low

outlier_token_is_threshold_low: Optional[float] = None

Set to mask sequences with any token IS ratio below this threshold. Suggested: 1e-4. None to disable.

attr outlier_token_is_threshold_high

outlier_token_is_threshold_high: Optional[float] = None

Set to mask sequences with any token IS ratio above this threshold. Suggested: 100. None to disable.

attr token_mask_is_threshold_low

token_mask_is_threshold_low: Optional[float] = None

Set to mask per-token when IS ratio < token_mask_is_threshold_low. None to disable.

attr token_mask_is_threshold_high

token_mask_is_threshold_high: Optional[float] = None

Set to mask per-token when IS ratio > token_mask_is_threshold_high. None to disable.

class FullyAsyncConfig

FullyAsyncConfig(max_staleness_steps: int = 4, num_parallel_generation_workers: int = 768) -> None

Bases: BaseConfig

Knobs for fully async training. See https://docs.skyrl.ai/docs/tutorials/fully_async#step-2-config-knobs-to-tune-for-fully-async-training.

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
max_staleness_stepsintMaximum off-policy steps allowed. If a trajectory group is scheduled at step i and trained at step j, then j - i <= max_staleness_steps.
num_parallel_generation_workersintNumber of generation workers to spawn. Should be >= policy_mini_batch_size and <= policy_mini_batch_size * (max_staleness_steps + 1).
Source code in skyrl/train/config/config.py:383-393
@dataclass
class FullyAsyncConfig(BaseConfig):
    """Knobs for fully async training.
    See https://docs.skyrl.ai/docs/tutorials/fully_async#step-2-config-knobs-to-tune-for-fully-async-training."""

    max_staleness_steps: int = 4
    """Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*,
    then ``j - i <= max_staleness_steps``. Larger values increase throughput but also off-policy-ness."""
    num_parallel_generation_workers: int = 768
    """Number of generation workers to spawn. Should be >= ``policy_mini_batch_size`` and
    <= ``policy_mini_batch_size * (max_staleness_steps + 1)``."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr max_staleness_steps

max_staleness_steps: int = 4

Maximum off-policy steps allowed. If a trajectory group is scheduled at step i and trained at step j, then j - i <= max_staleness_steps. Larger values increase throughput but also off-policy-ness.

attr num_parallel_generation_workers

num_parallel_generation_workers: int = 768

Number of generation workers to spawn. Should be >= policy_mini_batch_size and <= policy_mini_batch_size * (max_staleness_steps + 1).

Inference & Generation

class SamplingParams

SamplingParams(max_generate_length: int = 1024, repetition_penalty: float = 1.0, temperature: float = 1.0, top_p: float = 1.0, min_p: float = 0.0, top_k: int = -1, logprobs: Optional[int] = 1, stop: Optional[List[str]] = None, additional_kwargs: Optional[Dict[str, Any]] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
max_generate_lengthint
repetition_penaltyfloat
temperaturefloat
top_pfloat
min_pfloat
top_kint
logprobsOptional[int]
stopOptional[List[str]]
additional_kwargsOptional[Dict[str, Any]]
Source code in skyrl/train/config/config.py:401-411
@dataclass
class SamplingParams(BaseConfig):
    max_generate_length: int = 1024
    repetition_penalty: float = 1.0
    temperature: float = 1.0
    top_p: float = 1.0
    min_p: float = 0.0
    top_k: int = -1
    logprobs: Optional[int] = 1
    stop: Optional[List[str]] = None
    additional_kwargs: Optional[Dict[str, Any]] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr max_generate_length

max_generate_length: int = 1024

attr repetition_penalty

repetition_penalty: float = 1.0

attr temperature

temperature: float = 1.0

attr top_p

top_p: float = 1.0

attr min_p

min_p: float = 0.0

attr top_k

top_k: int = -1

attr logprobs

logprobs: Optional[int] = 1

attr stop

stop: Optional[List[str]] = None

attr additional_kwargs

additional_kwargs: Optional[Dict[str, Any]] = None

class ChatTemplateConfig

ChatTemplateConfig(source: str = 'name', name_or_path: Optional[str] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
sourcestr
name_or_pathOptional[str]
Source code in skyrl/train/config/config.py:414-417
@dataclass
class ChatTemplateConfig(BaseConfig):
    source: str = "name"
    name_or_path: Optional[str] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr source

source: str = 'name'

attr name_or_path

name_or_path: Optional[str] = None

class InferenceEngineConfig

InferenceEngineConfig(model_dtype: str = 'bfloat16', run_engines_locally: bool = True, num_engines: int = 1, backend: str = 'vllm', weight_sync_backend: str = 'nccl', weight_transfer_threshold_cuda_ipc_GB: float = 1.0, tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1, expert_parallel_size: int = 1, data_parallel_size: int = 1, async_engine: bool = True, vllm_v1_disable_multiproc: bool = True, enable_prefix_caching: bool = True, enable_chunked_prefill: bool = True, max_num_batched_tokens: int = 8192, enforce_eager: bool = True, fully_sharded_loras: bool = False, enable_ray_prometheus_stats: bool = False, gpu_memory_utilization: float = 0.8, max_num_seqs: int = 1024, remote_urls: List[str] = (lambda: [])(), enable_http_endpoint: bool = False, http_endpoint_host: str = '127.0.0.1', http_endpoint_port: int = 8000, served_model_name: Optional[str] = None, distributed_executor_backend: str = 'ray', engine_init_kwargs: Dict[str, Any] = dict(), override_existing_update_group: str = 'auto', external_proxy_url: Optional[str] = None, external_server_urls: Optional[List[str]] = None) -> None

Bases: BaseConfig

Configuration for inference engine instantiation and management.

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
model_dtypestrShould match the dtype used by the inference engine.
run_engines_locallybool
num_enginesint
backendstr"vllm".
weight_sync_backendstr
weight_transfer_threshold_cuda_ipc_GBfloatWhen using cuda_ipc, send weights in batches of this size (GB).
tensor_parallel_sizeint
pipeline_parallel_sizeint
expert_parallel_sizeint
data_parallel_sizeint
async_enginebool
vllm_v1_disable_multiprocboolSets VLLM_ENABLE_V1_MULTIPROCESSING=0 for reproducibility.
enable_prefix_cachingbool
enable_chunked_prefillbool
max_num_batched_tokensint
enforce_eagerboolDisable CUDA graphs for stability. Set to False for higher performance,
fully_sharded_lorasbool
enable_ray_prometheus_statsboolEnable Ray Prometheus stats logger for inference engine metrics (vLLM v1 only).
gpu_memory_utilizationfloat
max_num_seqsint
remote_urlsList[str]
enable_http_endpointboolWhen True, launch an OpenAI-compatible HTTP endpoint for the inference engine client so that generators can send requests to this server instead of using .generate() Python calls.
http_endpoint_hoststr
http_endpoint_portint
served_model_nameOptional[str]Model name for HTTP endpoint validation. If set, must be used in the model field of
distributed_executor_backendstrDistributed executor backend for vLLM. Set to "ray" to use the Ray backend
engine_init_kwargsDict[str, Any]Pass-through kwargs for the vLLM engine. Names must match the engine's args.
override_existing_update_groupstr"auto", "enable", or "disable".
external_proxy_urlOptional[str]Data-plane URL (load-balanced router) for the new inference layer.
external_server_urlsOptional[List[str]]Control-plane URLs (direct backend access) for the new inference layer.
Source code in skyrl/train/config/config.py:425-478
@dataclass
class InferenceEngineConfig(BaseConfig):
    """Configuration for inference engine instantiation and management."""

    model_dtype: str = "bfloat16"
    """Should match the dtype used by the inference engine."""
    run_engines_locally: bool = True
    num_engines: int = 1
    backend: str = "vllm"
    """``"vllm"``."""
    weight_sync_backend: str = "nccl"
    weight_transfer_threshold_cuda_ipc_GB: float = 1.0
    """When using ``cuda_ipc``, send weights in batches of this size (GB)."""
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1
    expert_parallel_size: int = 1
    data_parallel_size: int = 1
    async_engine: bool = True
    vllm_v1_disable_multiproc: bool = True
    """Sets ``VLLM_ENABLE_V1_MULTIPROCESSING=0`` for reproducibility."""
    enable_prefix_caching: bool = True
    enable_chunked_prefill: bool = True
    max_num_batched_tokens: int = 8192
    enforce_eager: bool = True
    """Disable CUDA graphs for stability. Set to ``False`` for higher performance,
    but this may affect convergence for long-running or long-context training jobs."""
    fully_sharded_loras: bool = False
    enable_ray_prometheus_stats: bool = False
    """Enable Ray Prometheus stats logger for inference engine metrics (vLLM v1 only)."""
    gpu_memory_utilization: float = 0.8
    max_num_seqs: int = 1024
    remote_urls: List[str] = field(default_factory=lambda: [])
    enable_http_endpoint: bool = False
    """When ``True``, launch an OpenAI-compatible HTTP endpoint for the inference engine client so that generators can send requests to this server instead of using ``.generate()`` Python calls.
    
    NOTE: When using HTTP endpoints directly, make sure to set ``trainer.algorithm.temperature`` to the temperature used during generation
    """
    http_endpoint_host: str = "127.0.0.1"
    http_endpoint_port: int = 8000
    served_model_name: Optional[str] = None
    """Model name for HTTP endpoint validation. If set, must be used in the ``model`` field of
    ``/chat/completions`` requests instead of the model path. If ``None``, the model path is used."""
    distributed_executor_backend: str = "ray"
    """Distributed executor backend for vLLM. Set to ``"ray"`` to use the Ray backend
    or ``"mp"`` to use the multiprocessing backend (single-node serving only). Per-engine 
    placement groups are created when ``"mp"`` is used."""
    engine_init_kwargs: Dict[str, Any] = field(default_factory=dict)
    """Pass-through kwargs for the vLLM engine. Names must match the engine's args."""
    override_existing_update_group: str = "auto"
    """``"auto"``, ``"enable"``, or ``"disable"``."""
    external_proxy_url: Optional[str] = None
    """Data-plane URL (load-balanced router) for the new inference layer."""
    external_server_urls: Optional[List[str]] = None
    """Control-plane URLs (direct backend access) for the new inference layer."""

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model_dtype

model_dtype: str = 'bfloat16'

Should match the dtype used by the inference engine.

attr run_engines_locally

run_engines_locally: bool = True

attr num_engines

num_engines: int = 1

attr backend

backend: str = 'vllm'

"vllm".

attr weight_sync_backend

weight_sync_backend: str = 'nccl'

attr weight_transfer_threshold_cuda_ipc_GB

weight_transfer_threshold_cuda_ipc_GB: float = 1.0

When using cuda_ipc, send weights in batches of this size (GB).

attr tensor_parallel_size

tensor_parallel_size: int = 1

attr pipeline_parallel_size

pipeline_parallel_size: int = 1

attr expert_parallel_size

expert_parallel_size: int = 1

attr data_parallel_size

data_parallel_size: int = 1

attr async_engine

async_engine: bool = True

attr vllm_v1_disable_multiproc

vllm_v1_disable_multiproc: bool = True

Sets VLLM_ENABLE_V1_MULTIPROCESSING=0 for reproducibility.

attr enable_prefix_caching

enable_prefix_caching: bool = True

attr enable_chunked_prefill

enable_chunked_prefill: bool = True

attr max_num_batched_tokens

max_num_batched_tokens: int = 8192

attr enforce_eager

enforce_eager: bool = True

Disable CUDA graphs for stability. Set to False for higher performance, but this may affect convergence for long-running or long-context training jobs.

attr fully_sharded_loras

fully_sharded_loras: bool = False

attr enable_ray_prometheus_stats

enable_ray_prometheus_stats: bool = False

Enable Ray Prometheus stats logger for inference engine metrics (vLLM v1 only).

attr gpu_memory_utilization

gpu_memory_utilization: float = 0.8

attr max_num_seqs

max_num_seqs: int = 1024

attr remote_urls

remote_urls: List[str] = field(default_factory=(lambda: []))

attr enable_http_endpoint

enable_http_endpoint: bool = False

When True, launch an OpenAI-compatible HTTP endpoint for the inference engine client so that generators can send requests to this server instead of using .generate() Python calls.

NOTE: When using HTTP endpoints directly, make sure to set trainer.algorithm.temperature to the temperature used during generation

attr http_endpoint_host

http_endpoint_host: str = '127.0.0.1'

attr http_endpoint_port

http_endpoint_port: int = 8000

attr served_model_name

served_model_name: Optional[str] = None

Model name for HTTP endpoint validation. If set, must be used in the model field of /chat/completions requests instead of the model path. If None, the model path is used.

attr distributed_executor_backend

distributed_executor_backend: str = 'ray'

Distributed executor backend for vLLM. Set to "ray" to use the Ray backend or "mp" to use the multiprocessing backend (single-node serving only). Per-engine placement groups are created when "mp" is used.

attr engine_init_kwargs

engine_init_kwargs: Dict[str, Any] = field(default_factory=dict)

Pass-through kwargs for the vLLM engine. Names must match the engine's args.

attr override_existing_update_group

override_existing_update_group: str = 'auto'

"auto", "enable", or "disable".

attr external_proxy_url

external_proxy_url: Optional[str] = None

Data-plane URL (load-balanced router) for the new inference layer.

attr external_server_urls

external_server_urls: Optional[List[str]] = None

Control-plane URLs (direct backend access) for the new inference layer.

class GeneratorConfig

GeneratorConfig(inference_engine: InferenceEngineConfig = InferenceEngineConfig(), n_samples_per_prompt: int = 5, batched: bool = False, max_turns: int = 1, max_input_length: Optional[int] = None, chat_template: ChatTemplateConfig = ChatTemplateConfig(), chat_template_kwargs: Dict[str, Any] = dict(), sampling_params: SamplingParams = SamplingParams(), use_conversation_multi_turn: bool = True, append_eos_token_after_stop_str_in_multi_turn: bool = True, eval_sampling_params: Optional[SamplingParams] = None, eval_n_samples_per_prompt: int = 1, zero_reward_on_non_stop: bool = False, apply_overlong_filtering: bool = False, rope_scaling: Optional[Dict[str, Any]] = None, rope_theta: Optional[float] = None, step_wise_trajectories: bool = False) -> None

Bases: BaseConfig

Configuration for generation behavior.

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
inference_engineInferenceEngineConfig
n_samples_per_promptint
batchedbool
max_turnsint
max_input_lengthOptional[int]Max generator input length for multi-turn conversations. For single-turn, set equal to max_prompt_length.
chat_templateChatTemplateConfig
chat_template_kwargsDict[str, Any]Kwargs passed to tokenizer.apply_chat_template.
sampling_paramsSamplingParams
use_conversation_multi_turnboolIf True, each multi-turn model response and env observation is stored in a separate
append_eos_token_after_stop_str_in_multi_turnboolWhen use_conversation_multi_turn=True and sampling_params.stop is set, append
eval_sampling_paramsOptional[SamplingParams]Separate sampling params for evaluation. If None, then it defaults to SamplingParams(temperature=0.0, max_generate_length=generator.sampling_params.max_generate_length).
eval_n_samples_per_promptint
zero_reward_on_non_stopboolSet reward to 0 when stop_reason is not "stop" (i.e., generation was truncated or aborted).
apply_overlong_filteringboolApply DAPO Overlong Filtering: mask out all tokens in the loss mask for trajectories that
rope_scalingOptional[Dict[str, Any]]Can differ from the trainer's rope_scaling, useful for thinking models.
rope_thetaOptional[float]
step_wise_trajectoriesbool
Source code in skyrl/train/config/config.py:486-524
@dataclass
class GeneratorConfig(BaseConfig):
    """Configuration for generation behavior."""

    inference_engine: InferenceEngineConfig = field(default_factory=InferenceEngineConfig)
    n_samples_per_prompt: int = 5
    batched: bool = False
    max_turns: int = 1
    max_input_length: Optional[int] = None
    """Max generator input length for multi-turn conversations. For single-turn, set equal to ``max_prompt_length``."""
    chat_template: ChatTemplateConfig = field(default_factory=ChatTemplateConfig)
    chat_template_kwargs: Dict[str, Any] = field(default_factory=dict)
    """Kwargs passed to ``tokenizer.apply_chat_template``."""
    sampling_params: SamplingParams = field(default_factory=SamplingParams)
    use_conversation_multi_turn: bool = True
    """If ``True``, each multi-turn model response and env observation is stored in a separate
    assistant/user message. If ``False``, they are appended to the original assistant response."""
    append_eos_token_after_stop_str_in_multi_turn: bool = True
    """When ``use_conversation_multi_turn=True`` and ``sampling_params.stop`` is set, append
    ``eos_token_id`` to generations that end with a matched stop string."""
    eval_sampling_params: Optional[SamplingParams] = None
    """Separate sampling params for evaluation. If ``None``, then it defaults to ``SamplingParams(temperature=0.0, max_generate_length=generator.sampling_params.max_generate_length)``."""
    eval_n_samples_per_prompt: int = 1
    zero_reward_on_non_stop: bool = False
    """Set reward to 0 when ``stop_reason`` is not ``"stop"`` (i.e., generation was truncated or aborted)."""
    apply_overlong_filtering: bool = False
    """Apply DAPO Overlong Filtering: mask out all tokens in the loss mask for trajectories that
    exceed max length (truncated, no EOS token)."""
    rope_scaling: Optional[Dict[str, Any]] = None
    """Can differ from the trainer's ``rope_scaling``, useful for thinking models."""
    rope_theta: Optional[float] = None
    step_wise_trajectories: bool = False

    def __post_init__(self):

        if self.eval_sampling_params is None:
            self.eval_sampling_params = SamplingParams(
                temperature=0.0, max_generate_length=self.sampling_params.max_generate_length
            )

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr inference_engine

inference_engine: InferenceEngineConfig = field(default_factory=InferenceEngineConfig)

attr n_samples_per_prompt

n_samples_per_prompt: int = 5

attr batched

batched: bool = False

attr max_turns

max_turns: int = 1

attr max_input_length

max_input_length: Optional[int] = None

Max generator input length for multi-turn conversations. For single-turn, set equal to max_prompt_length.

attr chat_template

chat_template: ChatTemplateConfig = field(default_factory=ChatTemplateConfig)

attr chat_template_kwargs

chat_template_kwargs: Dict[str, Any] = field(default_factory=dict)

Kwargs passed to tokenizer.apply_chat_template.

attr sampling_params

sampling_params: SamplingParams = field(default_factory=SamplingParams)

attr use_conversation_multi_turn

use_conversation_multi_turn: bool = True

If True, each multi-turn model response and env observation is stored in a separate assistant/user message. If False, they are appended to the original assistant response.

attr append_eos_token_after_stop_str_in_multi_turn

append_eos_token_after_stop_str_in_multi_turn: bool = True

When use_conversation_multi_turn=True and sampling_params.stop is set, append eos_token_id to generations that end with a matched stop string.

attr eval_sampling_params

eval_sampling_params: Optional[SamplingParams] = None

Separate sampling params for evaluation. If None, then it defaults to SamplingParams(temperature=0.0, max_generate_length=generator.sampling_params.max_generate_length).

attr eval_n_samples_per_prompt

eval_n_samples_per_prompt: int = 1

attr zero_reward_on_non_stop

zero_reward_on_non_stop: bool = False

Set reward to 0 when stop_reason is not "stop" (i.e., generation was truncated or aborted).

attr apply_overlong_filtering

apply_overlong_filtering: bool = False

Apply DAPO Overlong Filtering: mask out all tokens in the loss mask for trajectories that exceed max length (truncated, no EOS token).

attr rope_scaling

rope_scaling: Optional[Dict[str, Any]] = None

Can differ from the trainer's rope_scaling, useful for thinking models.

attr rope_theta

rope_theta: Optional[float] = None

attr step_wise_trajectories

step_wise_trajectories: bool = False

Environment

class EnvironmentConfig

EnvironmentConfig(env_class: str = 'gsm8k', skyrl_gym: SkyRLGymConfig = SkyRLGymConfig()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
env_classstr
skyrl_gymSkyRLGymConfig
Source code in skyrl/train/config/config.py:547-550
@dataclass
class EnvironmentConfig(BaseConfig):
    env_class: str = "gsm8k"
    skyrl_gym: SkyRLGymConfig = field(default_factory=SkyRLGymConfig)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr env_class

env_class: str = 'gsm8k'

attr skyrl_gym

skyrl_gym: SkyRLGymConfig = field(default_factory=SkyRLGymConfig)

class SkyRLGymConfig

SkyRLGymConfig(max_env_workers: int = 32, text2sql: Text2SQLEnvConfig = Text2SQLEnvConfig(), llm_as_a_judge: GSM8kLLMJudgeEnvConfig = GSM8kLLMJudgeEnvConfig(), search: SearchEnvConfig = SearchEnvConfig()) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
max_env_workersint
text2sqlText2SQLEnvConfig
llm_as_a_judgeGSM8kLLMJudgeEnvConfig
searchSearchEnvConfig
Source code in skyrl/train/config/config.py:539-544
@dataclass
class SkyRLGymConfig(BaseConfig):
    max_env_workers: int = 32
    text2sql: Text2SQLEnvConfig = field(default_factory=Text2SQLEnvConfig)
    llm_as_a_judge: GSM8kLLMJudgeEnvConfig = field(default_factory=GSM8kLLMJudgeEnvConfig)
    search: SearchEnvConfig = field(default_factory=SearchEnvConfig)

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr max_env_workers

max_env_workers: int = 32

attr text2sql

text2sql: Text2SQLEnvConfig = field(default_factory=Text2SQLEnvConfig)

attr llm_as_a_judge

llm_as_a_judge: GSM8kLLMJudgeEnvConfig = field(default_factory=GSM8kLLMJudgeEnvConfig)

attr search

search: SearchEnvConfig = field(default_factory=SearchEnvConfig)

class GSM8kLLMJudgeEnvConfig

GSM8kLLMJudgeEnvConfig(model: str = 'gpt-4o-mini', base_url: Optional[str] = None) -> None

Bases: BaseConfig

Functions:

NameDescription
from_dict_configConstruct a typed BaseConfig from a Hydra DictConfig.

Attributes:

NameTypeDescription
modelstr
base_urlOptional[str]
Source code in skyrl/train/config/config.py:533-536
@dataclass
class GSM8kLLMJudgeEnvConfig(BaseConfig):
    model: str = "gpt-4o-mini"
    base_url: Optional[str] = None

from_dict_config

from_dict_config(cfg: DictConfig) -> BaseConfig

Construct a typed BaseConfig from a Hydra DictConfig.

attr model

model: str = 'gpt-4o-mini'

attr base_url

base_url: Optional[str] = None

On this page

Top-Level Configclass SkyRLTrainConfigfrom_dict_configattr dataattr trainerattr generatorattr environmentmethod classmethod from_cli_overridesmethod make_configmethod get_config_as_dictmethod get_config_as_yaml_strData & Modelclass DataConfigfrom_dict_configattr train_dataattr val_dataclass ModelConfigfrom_dict_configattr pathattr loraclass SkyRLLoraConfigfrom_dict_configattr rankattr alphaattr dropoutattr lora_sync_pathattr target_modulesattr exclude_modulesattr init_methodTrainingclass TrainerConfigfrom_dict_configattr placementattr sequence_parallel_backendattr strategyattr policyattr refattr criticattr algorithmattr fully_asyncattr gradient_checkpointingattr gradient_checkpointing_use_reentrantattr seedattr resume_modeattr resume_pathattr log_pathattr ckpt_pathattr max_ckpts_to_keepattr ckpt_intervalattr hf_save_intervalattr export_pathattr bf16attr epochsattr update_epochs_per_batchattr train_batch_sizeattr policy_mini_batch_sizeattr critic_mini_batch_sizeattr micro_train_batch_size_per_gpuattr micro_forward_batch_size_per_gpuattr update_ref_every_epochattr use_sample_packingattr eval_batch_sizeattr eval_before_trainattr eval_intervalattr max_prompt_lengthattr flash_attnattr disable_fast_tokenizerattr project_nameattr run_nameattr loggerattr dump_data_batchattr dump_eval_resultsattr rope_scalingattr rope_thetaclass OptimizerConfigfrom_dict_configattr lrattr adam_betasattr weight_decayattr max_grad_normattr offload_after_stepattr num_warmup_stepsattr schedulerclass MixedPrecisionConfigfrom_dict_configattr param_dtypeattr reduce_dtypeattr buffer_dtypeBackend Configclass FSDPConfigfrom_dict_configattr cpu_offloadattr reshard_after_forwardattr fsdp_sizeattr mixed_precisionattr wrap_policyclass MegatronConfigfrom_dict_configattr tensor_model_parallel_sizeattr pipeline_model_parallel_sizeattr context_parallel_sizeattr expert_model_parallel_sizeattr expert_tensor_parallel_sizeattr moe_token_dispatcher_typeattr moe_router_load_balancing_typeattr moe_grouped_gemmattr 
moe_router_score_functionattr moe_router_enable_expert_biasattr ddp_configattr torch_profiler_configattr lora_configattr optimizer_config_kwargsattr transformer_config_kwargsattr empty_cuda_cacheattr model_config_kwargsattr dist_ckpt_optim_fully_reshardableclass MegatronDDPConfigfrom_dict_configattr grad_reduce_in_fp32attr overlap_grad_reduceattr overlap_param_gatherattr average_in_collectiveclass MegatronLoraConfigfrom_dict_configattr lora_typeclass MegatronTorchProfilerConfigfrom_dict_configattr enableattr ranksattr save_pathPlacementclass PlacementConfigfrom_dict_configattr colocate_allattr colocate_policy_refattr policy_num_nodesattr policy_num_gpus_per_nodeattr critic_num_nodesattr critic_num_gpus_per_nodeattr ref_num_nodesattr ref_num_gpus_per_nodePolicy & Algorithmclass PolicyConfigfrom_dict_configattr modelattr optimizer_configattr fsdp_configattr sequence_parallel_sizeattr use_torch_compileattr record_memoryattr megatron_configattr model_config_kwargsclass CriticConfigfrom_dict_configattr modelattr optimizer_configattr fsdp_configattr sequence_parallel_sizeattr model_config_kwargsclass RefConfigfrom_dict_configattr modelattr sequence_parallel_sizeattr fsdp_configattr megatron_configattr model_config_kwargsclass AlgorithmConfigfrom_dict_configattr advantage_estimatorattr kl_ctrlattr kl_estimator_typeattr use_kl_in_rewardattr use_kl_lossattr kl_loss_coefattr use_entropy_lossattr entropy_loss_coefattr temperatureattr advantage_batch_normalizeattr value_head_prefixattr policy_loss_typeattr loss_reductionattr grpo_norm_by_stdattr zero_variance_filterattr lambdattr gammaattr eps_clip_lowattr eps_clip_highattr clip_ratio_cattr tis_imp_ratio_capattr use_tisattr off_policy_correctionattr sapoattr value_clipattr dynamic_samplingattr clip_covattr kl_covattr cispoattr max_seq_lenclass KLCtrlConfigfrom_dict_configattr typeattr kl_targetattr horizonAlgorithm Extensionsclass SAPOConfigfrom_dict_configattr tau_posattr tau_negclass DynamicSamplingConfigfrom_dict_configattr 
typeattr max_sample_batchesattr min_replace_ratioclass ClipCovConfigfrom_dict_configattr clip_ratioattr clip_cov_lbattr clip_cov_ubclass KLCovConfigfrom_dict_configattr kl_cov_fracattr ppo_kl_coefclass CISPOConfigfrom_dict_configattr cispo_eps_clip_lowattr cispo_eps_clip_highclass OffPolicyCorrectionConfigfrom_dict_configattr tis_ratio_typeattr token_tis_ratio_clip_highattr sequence_tis_ratio_clip_highattr sequence_mask_metricattr geo_mask_highattr geo_mask_lowattr product_mask_highattr product_mask_lowattr outlier_token_is_threshold_lowattr outlier_token_is_threshold_highattr token_mask_is_threshold_lowattr token_mask_is_threshold_highclass FullyAsyncConfigfrom_dict_configattr max_staleness_stepsattr num_parallel_generation_workersInference & Generationclass SamplingParamsfrom_dict_configattr max_generate_lengthattr repetition_penaltyattr temperatureattr top_pattr min_pattr top_kattr logprobsattr stopattr additional_kwargsclass ChatTemplateConfigfrom_dict_configattr sourceattr name_or_pathclass InferenceEngineConfigfrom_dict_configattr model_dtypeattr run_engines_locallyattr num_enginesattr backendattr weight_sync_backendattr weight_transfer_threshold_cuda_ipc_GBattr tensor_parallel_sizeattr pipeline_parallel_sizeattr expert_parallel_sizeattr data_parallel_sizeattr async_engineattr vllm_v1_disable_multiprocattr enable_prefix_cachingattr enable_chunked_prefillattr max_num_batched_tokensattr enforce_eagerattr fully_sharded_lorasattr enable_ray_prometheus_statsattr gpu_memory_utilizationattr max_num_seqsattr remote_urlsattr enable_http_endpointattr http_endpoint_hostattr http_endpoint_portattr served_model_nameattr distributed_executor_backendattr engine_init_kwargsattr override_existing_update_groupattr external_proxy_urlattr external_server_urlsclass GeneratorConfigfrom_dict_configattr inference_engineattr n_samples_per_promptattr batchedattr max_turnsattr max_input_lengthattr chat_templateattr chat_template_kwargsattr sampling_paramsattr 
use_conversation_multi_turnattr append_eos_token_after_stop_str_in_multi_turnattr eval_sampling_paramsattr eval_n_samples_per_promptattr zero_reward_on_non_stopattr apply_overlong_filteringattr rope_scalingattr rope_thetaattr step_wise_trajectoriesEnvironmentclass EnvironmentConfigfrom_dict_configattr env_classattr skyrl_gymclass SkyRLGymConfigfrom_dict_configattr max_env_workersattr text2sqlattr llm_as_a_judgeattr searchclass GSM8kLLMJudgeEnvConfigfrom_dict_configattr modelattr base_url