# Rewards API
Reward functions define what "good" means for your training objective.
## RewardFunction Base Class
```python
# Exposed as: from flux.rewards import RewardFunction, RewardOutput
from abc import ABC, abstractmethod
from dataclasses import dataclass, field

from flux.core.trajectory import Trajectory


@dataclass
class RewardOutput:
    reward: float                                             # Scalar reward in [0, 1]
    token_rewards: list[float] = field(default_factory=list)  # Optional per-token rewards
    metadata: dict = field(default_factory=dict)              # Optional metadata


class RewardFunction(ABC):
    @abstractmethod
    def compute_reward(self, trajectory: Trajectory) -> RewardOutput:
        """Compute reward for a trajectory."""
        pass
```
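Subclasses implement `compute_reward` and return a `RewardOutput`. A toy end-to-end sketch; the `Trajectory` constructor arguments here are an assumption inferred from the examples on this page:

```python
from flux.core.trajectory import Trajectory
from flux.rewards import RewardFunction, RewardOutput

class HasAnswerReward(RewardFunction):
    """Toy subclass: full credit when the response contains 'Answer:'."""

    def compute_reward(self, trajectory: Trajectory) -> RewardOutput:
        hit = "Answer:" in trajectory.response
        return RewardOutput(reward=1.0 if hit else 0.0, metadata={"hit": hit})

traj = Trajectory(prompt="2 + 2?", response="Answer: 4")  # assumed constructor
print(HasAnswerReward().compute_reward(traj).reward)      # 1.0
```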
## Built-in Reward Functions

### LengthReward
Rewards responses whose length is close to a target word count.
```python
from flux.rewards import LengthReward

reward = LengthReward(
    target_length=200,       # Target word count
    tolerance=50,            # Acceptable deviation (in words)
    reward_type="gaussian",  # "gaussian" or "linear"
)
```
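The docs do not spell out the two shaping curves, but a plausible reading of the modes, shown purely as an assumption about the scoring math, is:

```python
import math

def gaussian_length_reward(n_words: int, target: int = 200, tolerance: int = 50) -> float:
    # Smooth bell curve: 1.0 at the target, ~0.37 one tolerance away.
    return math.exp(-((n_words - target) / tolerance) ** 2)

def linear_length_reward(n_words: int, target: int = 200, tolerance: int = 50) -> float:
    # Straight-line falloff that reaches 0.0 one tolerance away from the target.
    return max(0.0, 1.0 - abs(n_words - target) / tolerance)
```

Under this reading, the gaussian variant gives smoother, denser feedback near the target, while the linear variant is easier to reason about.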
### FormatReward

Rewards responses that include required sections and avoid forbidden phrases.
```python
from flux.rewards import FormatReward

reward = FormatReward(
    required_sections=["Introduction", "Conclusion"],
    forbidden_patterns=["I don't know", "As an AI"],
)
```
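A rough sketch of the check this presumably performs; the exact weighting below (equal credit per section, a fixed penalty per forbidden hit) is an assumption:

```python
def format_score(text: str, required: list[str], forbidden: list[str]) -> float:
    # Fraction of required sections present, minus a fixed penalty per forbidden match.
    section_score = sum(s in text for s in required) / max(len(required), 1)
    penalty = 0.25 * sum(p in text for p in forbidden)
    return max(0.0, min(1.0, section_score - penalty))
```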
### KeywordReward

Rewards the presence of required keywords, with optional bonus and penalty terms.
```python
from flux.rewards import KeywordReward

reward = KeywordReward(
    required_keywords=["because", "therefore"],
    bonus_keywords=["example"],
    penalty_keywords=["maybe"],
)
```
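Usage follows the same pattern as every other reward; the `Trajectory` construction is again an assumption:

```python
from flux.core.trajectory import Trajectory

traj = Trajectory(
    prompt="Why does ice float?",
    response="Ice floats because it is less dense than water; therefore it stays on top.",
)
print(reward.compute_reward(traj).reward)  # both required keywords hit, no penalty words
```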
### FunctionReward

Wraps a plain Python function as a quick custom reward.
```python
from flux.rewards import FunctionReward

def my_scorer(trajectory):
    return 1.0 if len(trajectory.response) > 100 else 0.5

reward = FunctionReward(fn=my_scorer)
```
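Assuming `FunctionReward` accepts any callable and wraps a bare float into a `RewardOutput` (only the named-function form is shown above), a lambda works for one-liners:

```python
# Assumption: a plain float return is wrapped into RewardOutput automatically.
confident = FunctionReward(fn=lambda t: 0.0 if "maybe" in t.response else 1.0)
```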
### CompositeReward

Combines multiple reward functions with relative weights.
```python
from flux.rewards import CompositeReward

reward = CompositeReward([
    (LengthReward(target_length=150), 0.3),               # 30% weight
    (KeywordReward(required_keywords=["because"]), 0.4),  # 40% weight
    (FormatReward(required_sections=["Answer"]), 0.3),    # 30% weight
])
```
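The combination rule is presumably a weighted sum (an assumption; it is not stated here). Under that reading, weights that sum to 1.0, as above, keep the composite score in [0, 1]:

```python
def weighted_sum(components, trajectory) -> float:
    # components: (RewardFunction, weight) pairs, as passed to CompositeReward.
    return sum(w * fn.compute_reward(trajectory).reward for fn, w in components)
```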
## Model-Based Rewards

### RewardModel

Scores responses with a trained neural reward model.
```python
from flux.rewards import RewardModel

reward = RewardModel(
    model_path="OpenAssistant/reward-model-deberta",
    device="cuda",
)
```
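Internally this presumably runs a sequence-classification head over the prompt/response pair. A standalone sketch using Hugging Face `transformers`; the full checkpoint name and the sigmoid squashing are assumptions, not `flux` internals:

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "OpenAssistant/reward-model-deberta-v3-large-v2"  # assumed full checkpoint name
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name).eval()

def score(prompt: str, response: str) -> float:
    inputs = tokenizer(prompt, response, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logit = model(**inputs).logits[0, 0]  # raw scalar preference score
    return torch.sigmoid(logit).item()        # squash into [0, 1]
```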
### LLMJudge

Uses another LLM as a judge to score responses.
```python
from flux.rewards import LLMJudge

reward = LLMJudge(
    judge_model="gpt-4",
    prompt_template="Rate this response...",
    api_key="...",
)
```
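The judge flow is roughly: fill the template, call the judge model, parse a numeric rating. A sketch where `call_judge` is a hypothetical stand-in for the actual API client:

```python
import re

def judge_score(response_text: str, call_judge) -> float:
    # call_judge: hypothetical callable mapping a prompt string to the judge's reply.
    prompt = f"Rate this response from 1 to 10. Reply with the number only.\n\n{response_text}"
    reply = call_judge(prompt)
    match = re.search(r"\d+(\.\d+)?", reply)
    rating = float(match.group()) if match else 1.0  # worst score if unparseable
    return (rating - 1.0) / 9.0                      # map 1-10 onto [0, 1]
```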
## Creating Custom Rewards
```python
from flux.rewards import RewardFunction, RewardOutput
from flux.core.trajectory import Trajectory


class QualityReward(RewardFunction):
    def __init__(self, weights: dict | None = None):
        self.weights = weights or {
            "length": 0.3,
            "structure": 0.3,
            "clarity": 0.4,
        }

    def compute_reward(self, trajectory: Trajectory) -> RewardOutput:
        response = trajectory.response

        # Compute sub-scores
        length_score = self._score_length(response)
        structure_score = self._score_structure(response)
        clarity_score = self._score_clarity(response)

        # Weighted combination
        score = (
            self.weights["length"] * length_score
            + self.weights["structure"] * structure_score
            + self.weights["clarity"] * clarity_score
        )

        return RewardOutput(
            reward=score,
            metadata={
                "length": length_score,
                "structure": structure_score,
                "clarity": clarity_score,
            },
        )

    def _score_length(self, text: str) -> float:
        # Saturating ramp: full credit at 200+ words.
        words = len(text.split())
        return min(1.0, words / 200)

    def _score_structure(self, text: str) -> float:
        # Cheap check for lists or ordered discourse markers.
        has_structure = any(x in text for x in ["1.", "First", "•"])
        return 1.0 if has_structure else 0.3

    def _score_clarity(self, text: str) -> float:
        # Simplified clarity heuristic: prefer 10-25 words per sentence.
        sentences = text.count(".")
        if sentences < 2:
            return 0.3
        avg_words = len(text.split()) / sentences
        return 1.0 if 10 <= avg_words <= 25 else 0.5
```
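A quick smoke test (the `Trajectory` construction is again an assumption):

```python
from flux.core.trajectory import Trajectory

traj = Trajectory(
    prompt="Explain recursion.",
    response="First, a function calls itself. 1. Base case stops it. 2. Each call shrinks the problem.",
)
out = QualityReward().compute_reward(traj)
print(out.reward)    # ~0.52: short (low length score) but structured, with mid clarity
print(out.metadata)  # per-dimension sub-scores for debugging
```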
## Best Practices

- **Normalize to [0, 1]:** a consistent scale keeps gradients stable and makes rewards comparable when combined.
- **Avoid sparse rewards:** gradual, partial-credit feedback gives the policy a learning signal from the first steps.
- **Log metadata:** per-component sub-scores (as in `QualityReward` above) make reward bugs far easier to diagnose.
- **Test thoroughly:** verify the reward distribution over sample responses before training, as in the sketch below.
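A minimal distribution check, assuming you already have a list of sample trajectories:

```python
import statistics

def check_reward_distribution(reward_fn, trajectories) -> None:
    # A near-constant reward (tiny stdev) gives the trainer almost no signal.
    scores = [reward_fn.compute_reward(t).reward for t in trajectories]
    print(f"mean={statistics.mean(scores):.3f}  stdev={statistics.stdev(scores):.3f}")
    print(f"min={min(scores):.3f}  max={max(scores):.3f}")
```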