Skip to content

Commit 56b672e

Browse files
committed
Allow users to define task groups in LMEvalJob
Add a new field, TaskGroups, under TaskList to support custom task groups. Users can define a custom task group and specify a list of aggregate metrics for it. In the result JSON, task groups have a dedicated section for their results. Signed-off-by: Yihong Wang <[email protected]>
1 parent bacdc4d commit 56b672e

File tree

8 files changed

+673
-23
lines changed

8 files changed

+673
-23
lines changed

api/lmes/v1alpha1/lmevaljob_types.go

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,10 @@ func (c *CustomArtifacts) GetTasks() []CustomArtifact {
184184
// Find details of the Unitxt Recipe here:
185185
// https://www.unitxt.ai/en/latest/unitxt.standard.html#unitxt.standard.StandardRecipe
186186
type TaskRecipe struct {
187+
// The name of the TaskRecipe
188+
// +optional
189+
// +kubebuilder:validation:Pattern=`^[a-zA-Z0-9._]+$`
190+
Name *string `json:"name,omitempty"`
187191
// The Unitxt dataset card
188192
Card Card `json:"card"`
189193
// The Unitxt template
@@ -241,12 +245,44 @@ type CustomTasks struct {
241245
Source CustomTaskSource `json:"source,omitempty"`
242246
}
243247

248+
// Define an aggregate metric using 'mean' aggregation.
249+
type AggregateMetric struct {
250+
// The name of the metric to aggregate
251+
// +kubebuilder:validation:Pattern=`^[a-zA-Z0-9._]+$`
252+
MetricName string `json:"metricName"`
253+
// Weight by size or not. Default value is True
254+
// +optional
255+
// +kubebuilder:default=true
256+
WeightBySize *bool `json:"weightBySize,omitempty"`
257+
}
258+
259+
// +kubebuilder:validation:XValidation:rule="has(self.taskNames) || has(self.taskRecipes)", message="One of taskNames or taskRecipes must be defined"
260+
type TaskGroup struct {
261+
// The name of the task group
262+
// +kubebuilder:validation:Pattern=`^[a-zA-Z0-9._]+$`
263+
Name string `json:"name"`
264+
// TaskNames from lm-eval's task list and/or from custom tasks if CustomTasks is defined
265+
// +optional
266+
// +kubebuilder:validation:items:Pattern=`^[a-zA-Z0-9._]+$`
267+
TaskNames []string `json:"taskNames,omitempty"`
268+
// Task Recipes specifically for the Unitxt tasks
269+
// +optional
270+
TaskRecipes []TaskRecipe `json:"taskRecipes,omitempty"`
271+
// A list of aggregate metrics to calculate for the task group
272+
// +optional
273+
AggregateMetrics []AggregateMetric `json:"aggregateMetrics,omitempty"`
274+
}
275+
276+
// +kubebuilder:validation:XValidation:rule="has(self.taskNames) || has(self.taskRecipes) || has(self.taskGroups)", message="One of taskNames, taskRecipes, or taskGroups must be defined"
277+
244278
type TaskList struct {
245279
// TaskNames from lm-eval's task list and/or from custom tasks if CustomTasks is defined
246-
// +kubebuilder:validation:items:Pattern=`^[a-zA-Z0-9._-]+$`
280+
// +kubebuilder:validation:items:Pattern=`^[a-zA-Z0-9._]+$`
247281
TaskNames []string `json:"taskNames,omitempty"`
248282
// Task Recipes specifically for Unitxt
249283
TaskRecipes []TaskRecipe `json:"taskRecipes,omitempty"`
284+
// Task Groups are a list of tasks that their metrics are aggregated in the result
285+
TaskGroups []TaskGroup `json:"taskGroups,omitempty"`
250286
// Custom Unitxt artifacts that can be used in a TaskRecipe
251287
CustomArtifacts *CustomArtifacts `json:"custom,omitempty"`
252288
// CustomTasks is a list of external tasks
@@ -346,6 +382,9 @@ func (t *TaskRecipe) String() string {
346382
if t.DemosPoolSize != nil {
347383
b.WriteString(fmt.Sprintf(",demos_pool_size=%d", *t.DemosPoolSize))
348384
}
385+
if t.Name != nil && *t.Name != "" {
386+
b.WriteString(fmt.Sprintf("|%s", *t.Name))
387+
}
349388
return b.String()
350389
}
351390

api/lmes/v1alpha1/zz_generated.deepcopy.go

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/lmes_driver/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ func (t *strArrayArg) String() string {
5151

5252
var (
5353
taskRecipes strArrayArg
54+
taskGroups strArrayArg
5455
customArtifactArgs strArrayArg
5556
taskNames strArrayArg
5657
copy = flag.String("copy", "", "copy this binary to specified destination path")
@@ -70,6 +71,7 @@ var (
7071

7172
func init() {
7273
flag.Var(&taskRecipes, "task-recipe", "task recipe")
74+
flag.Var(&taskGroups, "task-group", "task group")
7375
flag.Var(&customArtifactArgs, "custom-artifact", "A string contains an artifact's type, name and value. Use | as separator")
7476
flag.Var(&taskNames, "task-name", "A task name for custom tasks")
7577
}
@@ -125,6 +127,7 @@ func main() {
125127
DetectDevice: *detectDevice,
126128
Logger: driverLog,
127129
TaskRecipes: taskRecipes,
130+
TaskGroups: taskGroups,
128131
CustomArtifacts: customArtifacts,
129132
Args: args,
130133
CommPort: *commPort,

config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml

Lines changed: 151 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4851,11 +4851,154 @@ spec:
48514851
type: object
48524852
type: object
48534853
type: object
4854+
taskGroups:
4855+
description: Task Groups are a list of tasks that their metrics
4856+
are aggregated in the result
4857+
items:
4858+
properties:
4859+
aggregateMetrics:
4860+
description: A list of aggregate metrics to calculate for
4861+
the task group
4862+
items:
4863+
description: Define an aggregate metric using 'mean' aggregation.
4864+
properties:
4865+
metricName:
4866+
description: The name of the metric to aggregate
4867+
pattern: ^[a-zA-Z0-9._]+$
4868+
type: string
4869+
weightBySize:
4870+
default: true
4871+
description: Weight by size or not. Default value
4872+
is True
4873+
type: boolean
4874+
required:
4875+
- metricName
4876+
type: object
4877+
type: array
4878+
name:
4879+
description: The name of the task group
4880+
pattern: ^[a-zA-Z0-9._]+$
4881+
type: string
4882+
taskNames:
4883+
description: TaskNames from lm-eval's task list and/or from
4884+
custom tasks if CustomTasks is defined
4885+
items:
4886+
pattern: ^[a-zA-Z0-9._]+$
4887+
type: string
4888+
type: array
4889+
taskRecipes:
4890+
description: Task Recipes specifically for the Unitxt tasks
4891+
items:
4892+
description: |-
4893+
Use a task recipe to form a custom task. It maps to the Unitxt Recipe
4894+
Find details of the Unitxt Recipe here:
4895+
https://www.unitxt.ai/en/latest/unitxt.standard.html#unitxt.standard.StandardRecipe
4896+
properties:
4897+
card:
4898+
description: The Unitxt dataset card
4899+
properties:
4900+
custom:
4901+
description: |-
4902+
A JSON string for a custom unitxt card which contains the custom dataset.
4903+
Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_dataset.html#adding-to-the-catalog
4904+
to compose a custom card, store it as a JSON file, and use the JSON content as the value here.
4905+
type: string
4906+
name:
4907+
description: Unitxt card's ID
4908+
pattern: ^[a-zA-Z0-9._-]+$
4909+
type: string
4910+
type: object
4911+
demosPoolSize:
4912+
description: The pool size for the fewshot
4913+
type: integer
4914+
format:
4915+
description: The Unitxt format
4916+
type: string
4917+
loaderLimit:
4918+
description: A limit number of records to load
4919+
type: integer
4920+
metrics:
4921+
description: Metrics
4922+
items:
4923+
properties:
4924+
name:
4925+
description: Unitxt metric id
4926+
type: string
4927+
ref:
4928+
description: |-
4929+
The name of the custom metric in the custom field. Its value is a JSON string
4930+
for a custom Unitxt metric. Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_metric.html#adding-a-new-instance-metric
4931+
to compose a custom metric, store it as a JSON file by calling the
4932+
add_to_catalog API: https://www.unitxt.ai/en/latest/docs/saving_and_loading_from_catalog.html#adding-assets-to-the-catalog,
4933+
and use the JSON content as the value here.
4934+
type: string
4935+
type: object
4936+
type: array
4937+
name:
4938+
description: The name of the TaskRecipe
4939+
pattern: ^[a-zA-Z0-9._]+$
4940+
type: string
4941+
numDemos:
4942+
description: Number of fewshot
4943+
type: integer
4944+
systemPrompt:
4945+
description: The Unitxt System Prompt
4946+
properties:
4947+
name:
4948+
description: Unitxt System Prompt id
4949+
type: string
4950+
ref:
4951+
description: The name of the custom systemPrompt
4952+
in the custom field. Its value is a custom system
4953+
prompt string
4954+
type: string
4955+
type: object
4956+
task:
4957+
description: The Unitxt Task
4958+
properties:
4959+
name:
4960+
description: Unitxt task id
4961+
type: string
4962+
ref:
4963+
description: |-
4964+
The name of the custom task in the custom field. Its value is a JSON string
4965+
for a custom Unitxt task. Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_task.html
4966+
to compose a custom task, store it as a JSON file by calling the
4967+
add_to_catalog API: https://www.unitxt.ai/en/latest/docs/saving_and_loading_from_catalog.html#adding-assets-to-the-catalog,
4968+
and use the JSON content as the value here.
4969+
type: string
4970+
type: object
4971+
template:
4972+
description: The Unitxt template
4973+
properties:
4974+
name:
4975+
description: Unitxt template ID
4976+
type: string
4977+
ref:
4978+
description: |-
4979+
The name of the custom template in the custom field. Its value is a JSON string
4980+
for a custom Unitxt template. Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_template.html
4981+
to compose a custom template, store it as a JSON file by calling the
4982+
add_to_catalog API: https://www.unitxt.ai/en/latest/docs/saving_and_loading_from_catalog.html#adding-assets-to-the-catalog,
4983+
and use the JSON content as the value here.
4984+
type: string
4985+
type: object
4986+
required:
4987+
- card
4988+
type: object
4989+
type: array
4990+
required:
4991+
- name
4992+
type: object
4993+
x-kubernetes-validations:
4994+
- message: One of taskNames or taskRecipes must be defined
4995+
rule: has(self.taskNames) || has(self.taskRecipes)
4996+
type: array
48544997
taskNames:
48554998
description: TaskNames from lm-eval's task list and/or from custom
48564999
tasks if CustomTasks is defined
48575000
items:
4858-
pattern: ^[a-zA-Z0-9._-]+$
5001+
pattern: ^[a-zA-Z0-9._]+$
48595002
type: string
48605003
type: array
48615004
taskRecipes:
@@ -4906,6 +5049,10 @@ spec:
49065049
type: string
49075050
type: object
49085051
type: array
5052+
name:
5053+
description: The name of the TaskRecipe
5054+
pattern: ^[a-zA-Z0-9._]+$
5055+
type: string
49095056
numDemos:
49105057
description: Number of fewshot
49115058
type: integer
@@ -4956,6 +5103,9 @@ spec:
49565103
type: object
49575104
type: array
49585105
type: object
5106+
x-kubernetes-validations:
5107+
- message: One of taskNames, taskRecipes, or taskGroups must be defined
5108+
rule: has(self.taskNames) || has(self.taskRecipes) || has(self.taskGroups)
49595109
required:
49605110
- model
49615111
- taskList

0 commit comments

Comments
 (0)