跳转到内容

pydantic_evals.reporting

ReportCase dataclass

基类:Generic[InputsT, OutputT, MetadataT]

评估报告中的单个案例。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
@dataclass(kw_only=True)
class ReportCase(Generic[InputsT, OutputT, MetadataT]):
    """A single case in an evaluation report."""

    name: str
    """The name of the [case][pydantic_evals.Case]."""
    inputs: InputsT
    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
    metadata: MetadataT | None
    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
    expected_output: OutputT | None
    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
    output: OutputT
    """The output of the task execution."""

    metrics: dict[str, float | int]
    attributes: dict[str, Any]

    scores: dict[str, EvaluationResult[int | float]]
    labels: dict[str, EvaluationResult[str]]
    assertions: dict[str, EvaluationResult[bool]]

    task_duration: float
    total_duration: float  # includes evaluator execution time

    trace_id: str | None = None
    """The trace ID of the case span."""
    span_id: str | None = None
    """The span ID of the case span."""

    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)

name instance-attribute

name: str

案例的名称。

inputs instance-attribute

inputs: InputsT

任务的输入,来自Case.inputs

metadata 实例属性

metadata: MetadataT | None

与案例相关的任何元数据,来自Case.metadata

expected_output instance-attribute

expected_output: OutputT | None

任务的预期输出,来自Case.expected_output

output instance-attribute

output: OutputT

任务执行的输出。

trace_id class-attribute instance-attribute

trace_id: str | None = None

案例跨度的追踪 ID。

span_id class-attribute instance-attribute

span_id: str | None = None

案例跨度的跨度 ID。

ReportCaseFailure dataclass

基类:Generic[InputsT, OutputT, MetadataT]

评估报告中因任务执行期间出错而失败的单个案例。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@dataclass(kw_only=True)
class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
    """A single case in an evaluation report that failed due to an error during task execution."""

    name: str
    """The name of the [case][pydantic_evals.Case]."""
    inputs: InputsT
    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
    metadata: MetadataT | None
    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
    expected_output: OutputT | None
    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""

    error_message: str
    """The message of the exception that caused the failure."""
    error_stacktrace: str
    """The stacktrace of the exception that caused the failure."""

    trace_id: str | None = None
    """The trace ID of the case span."""
    span_id: str | None = None
    """The span ID of the case span."""

name instance-attribute

name: str

案例的名称。

inputs instance-attribute

inputs: InputsT

任务的输入,来自Case.inputs

metadata 实例属性

metadata: MetadataT | None

与案例相关的任何元数据,来自Case.metadata

expected_output instance-attribute

expected_output: OutputT | None

任务的预期输出,来自Case.expected_output

error_message instance-attribute

error_message: str

导致失败的异常信息。

error_stacktrace instance-attribute

error_stacktrace: str

导致失败的异常的堆栈跟踪。

trace_id class-attribute instance-attribute

trace_id: str | None = None

案例跨度的追踪 ID。

span_id class-attribute instance-attribute

span_id: str | None = None

案例跨度的跨度 ID。

ReportCaseAggregate

基类:BaseModel

一个综合性案例,用于总结一组案例。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
class ReportCaseAggregate(BaseModel):
    """A synthetic case that summarizes a set of cases."""

    name: str

    scores: dict[str, float | int]
    labels: dict[str, dict[str, float]]
    metrics: dict[str, float | int]
    assertions: float | None
    task_duration: float
    total_duration: float

    @staticmethod
    def average(cases: list[ReportCase]) -> ReportCaseAggregate:
        """Produce a synthetic "summary" case by averaging quantitative attributes."""
        num_cases = len(cases)
        if num_cases == 0:
            return ReportCaseAggregate(
                name='Averages',
                scores={},
                labels={},
                metrics={},
                assertions=None,
                task_duration=0.0,
                total_duration=0.0,
            )

        def _scores_averages(scores_by_name: list[dict[str, int | float | bool]]) -> dict[str, float]:
            counts_by_name: dict[str, int] = defaultdict(int)
            sums_by_name: dict[str, float] = defaultdict(float)
            for sbn in scores_by_name:
                for name, score in sbn.items():
                    counts_by_name[name] += 1
                    sums_by_name[name] += score
            return {name: sums_by_name[name] / counts_by_name[name] for name in sums_by_name}

        def _labels_averages(labels_by_name: list[dict[str, str]]) -> dict[str, dict[str, float]]:
            counts_by_name: dict[str, int] = defaultdict(int)
            sums_by_name: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float))
            for lbn in labels_by_name:
                for name, label in lbn.items():
                    counts_by_name[name] += 1
                    sums_by_name[name][label] += 1
            return {
                name: {value: count / counts_by_name[name] for value, count in sums_by_name[name].items()}
                for name in sums_by_name
            }

        average_task_duration = sum(case.task_duration for case in cases) / num_cases
        average_total_duration = sum(case.total_duration for case in cases) / num_cases

        # average_assertions: dict[str, float] = _scores_averages([{k: v.value for k, v in case.scores.items()} for case in cases])
        average_scores: dict[str, float] = _scores_averages(
            [{k: v.value for k, v in case.scores.items()} for case in cases]
        )
        average_labels: dict[str, dict[str, float]] = _labels_averages(
            [{k: v.value for k, v in case.labels.items()} for case in cases]
        )
        average_metrics: dict[str, float] = _scores_averages([case.metrics for case in cases])

        average_assertions: float | None = None
        n_assertions = sum(len(case.assertions) for case in cases)
        if n_assertions > 0:
            n_passing = sum(1 for case in cases for assertion in case.assertions.values() if assertion.value)
            average_assertions = n_passing / n_assertions

        return ReportCaseAggregate(
            name='Averages',
            scores=average_scores,
            labels=average_labels,
            metrics=average_metrics,
            assertions=average_assertions,
            task_duration=average_task_duration,
            total_duration=average_total_duration,
        )

average staticmethod

average(cases: list[ReportCase]) -> ReportCaseAggregate

通过对定量属性求平均值来生成一个综合性的“摘要”案例。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
@staticmethod
def average(cases: list[ReportCase]) -> ReportCaseAggregate:
    """Produce a synthetic "summary" case by averaging quantitative attributes."""
    num_cases = len(cases)
    if num_cases == 0:
        return ReportCaseAggregate(
            name='Averages',
            scores={},
            labels={},
            metrics={},
            assertions=None,
            task_duration=0.0,
            total_duration=0.0,
        )

    def _scores_averages(scores_by_name: list[dict[str, int | float | bool]]) -> dict[str, float]:
        counts_by_name: dict[str, int] = defaultdict(int)
        sums_by_name: dict[str, float] = defaultdict(float)
        for sbn in scores_by_name:
            for name, score in sbn.items():
                counts_by_name[name] += 1
                sums_by_name[name] += score
        return {name: sums_by_name[name] / counts_by_name[name] for name in sums_by_name}

    def _labels_averages(labels_by_name: list[dict[str, str]]) -> dict[str, dict[str, float]]:
        counts_by_name: dict[str, int] = defaultdict(int)
        sums_by_name: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float))
        for lbn in labels_by_name:
            for name, label in lbn.items():
                counts_by_name[name] += 1
                sums_by_name[name][label] += 1
        return {
            name: {value: count / counts_by_name[name] for value, count in sums_by_name[name].items()}
            for name in sums_by_name
        }

    average_task_duration = sum(case.task_duration for case in cases) / num_cases
    average_total_duration = sum(case.total_duration for case in cases) / num_cases

    # average_assertions: dict[str, float] = _scores_averages([{k: v.value for k, v in case.scores.items()} for case in cases])
    average_scores: dict[str, float] = _scores_averages(
        [{k: v.value for k, v in case.scores.items()} for case in cases]
    )
    average_labels: dict[str, dict[str, float]] = _labels_averages(
        [{k: v.value for k, v in case.labels.items()} for case in cases]
    )
    average_metrics: dict[str, float] = _scores_averages([case.metrics for case in cases])

    average_assertions: float | None = None
    n_assertions = sum(len(case.assertions) for case in cases)
    if n_assertions > 0:
        n_passing = sum(1 for case in cases for assertion in case.assertions.values() if assertion.value)
        average_assertions = n_passing / n_assertions

    return ReportCaseAggregate(
        name='Averages',
        scores=average_scores,
        labels=average_labels,
        metrics=average_metrics,
        assertions=average_assertions,
        task_duration=average_task_duration,
        total_duration=average_total_duration,
    )

EvaluationReport dataclass

基类:Generic[InputsT, OutputT, MetadataT]

一份关于在一组案例上评估模型结果的报告。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
@dataclass(kw_only=True)
class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
    """A report of the results of evaluating a model on a set of cases."""

    name: str
    """The name of the report."""

    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
    """The cases in the report."""
    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
    """The failures in the report. These are cases where task execution raised an exception."""

    trace_id: str | None = None
    """The trace ID of the evaluation."""
    span_id: str | None = None
    """The span ID of the evaluation."""

    def averages(self) -> ReportCaseAggregate | None:
        if self.cases:
            return ReportCaseAggregate.average(self.cases)
        return None

    def print(
        self,
        width: int | None = None,
        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
        *,
        include_input: bool = False,
        include_metadata: bool = False,
        include_expected_output: bool = False,
        include_output: bool = False,
        include_durations: bool = True,
        include_total_duration: bool = False,
        include_removed_cases: bool = False,
        include_averages: bool = True,
        include_errors: bool = True,
        include_error_stacktrace: bool = False,
        include_evaluator_failures: bool = True,
        input_config: RenderValueConfig | None = None,
        metadata_config: RenderValueConfig | None = None,
        output_config: RenderValueConfig | None = None,
        score_configs: dict[str, RenderNumberConfig] | None = None,
        label_configs: dict[str, RenderValueConfig] | None = None,
        metric_configs: dict[str, RenderNumberConfig] | None = None,
        duration_config: RenderNumberConfig | None = None,
        include_reasons: bool = False,
    ):  # pragma: no cover
        """Print this report to the console, optionally comparing it to a baseline report.

        If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
        """
        table = self.console_table(
            baseline=baseline,
            include_input=include_input,
            include_metadata=include_metadata,
            include_expected_output=include_expected_output,
            include_output=include_output,
            include_durations=include_durations,
            include_total_duration=include_total_duration,
            include_removed_cases=include_removed_cases,
            include_averages=include_averages,
            include_evaluator_failures=include_evaluator_failures,
            input_config=input_config,
            metadata_config=metadata_config,
            output_config=output_config,
            score_configs=score_configs,
            label_configs=label_configs,
            metric_configs=metric_configs,
            duration_config=duration_config,
            include_reasons=include_reasons,
        )
        console = Console(width=width)
        console.print(table)
        if include_errors and self.failures:
            failures_table = self.failures_table(
                include_input=include_input,
                include_metadata=include_metadata,
                include_expected_output=include_expected_output,
                include_error_message=True,
                include_error_stacktrace=include_error_stacktrace,
                input_config=input_config,
                metadata_config=metadata_config,
            )
            console.print(failures_table, style='red')

    def console_table(
        self,
        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
        *,
        include_input: bool = False,
        include_metadata: bool = False,
        include_expected_output: bool = False,
        include_output: bool = False,
        include_durations: bool = True,
        include_total_duration: bool = False,
        include_removed_cases: bool = False,
        include_averages: bool = True,
        include_evaluator_failures: bool = True,
        input_config: RenderValueConfig | None = None,
        metadata_config: RenderValueConfig | None = None,
        output_config: RenderValueConfig | None = None,
        score_configs: dict[str, RenderNumberConfig] | None = None,
        label_configs: dict[str, RenderValueConfig] | None = None,
        metric_configs: dict[str, RenderNumberConfig] | None = None,
        duration_config: RenderNumberConfig | None = None,
        include_reasons: bool = False,
    ) -> Table:
        """Return a table containing the data from this report, or the diff between this report and a baseline report.

        Optionally include input and output details.
        """
        renderer = EvaluationRenderer(
            include_input=include_input,
            include_metadata=include_metadata,
            include_expected_output=include_expected_output,
            include_output=include_output,
            include_durations=include_durations,
            include_total_duration=include_total_duration,
            include_removed_cases=include_removed_cases,
            include_averages=include_averages,
            include_error_message=False,
            include_error_stacktrace=False,
            include_evaluator_failures=include_evaluator_failures,
            input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
            metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
            output_config=output_config or _DEFAULT_VALUE_CONFIG,
            score_configs=score_configs or {},
            label_configs=label_configs or {},
            metric_configs=metric_configs or {},
            duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
            include_reasons=include_reasons,
        )
        if baseline is None:
            return renderer.build_table(self)
        else:  # pragma: no cover
            return renderer.build_diff_table(self, baseline)

    def failures_table(
        self,
        *,
        include_input: bool = False,
        include_metadata: bool = False,
        include_expected_output: bool = False,
        include_error_message: bool = True,
        include_error_stacktrace: bool = True,
        input_config: RenderValueConfig | None = None,
        metadata_config: RenderValueConfig | None = None,
    ) -> Table:
        """Return a table containing the failures in this report."""
        renderer = EvaluationRenderer(
            include_input=include_input,
            include_metadata=include_metadata,
            include_expected_output=include_expected_output,
            include_output=False,
            include_durations=False,
            include_total_duration=False,
            include_removed_cases=False,
            include_averages=False,
            input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
            metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
            output_config=_DEFAULT_VALUE_CONFIG,
            score_configs={},
            label_configs={},
            metric_configs={},
            duration_config=_DEFAULT_DURATION_CONFIG,
            include_reasons=False,
            include_error_message=include_error_message,
            include_error_stacktrace=include_error_stacktrace,
            include_evaluator_failures=False,  # Not applicable for failures table
        )
        return renderer.build_failures_table(self)

    def __str__(self) -> str:  # pragma: lax no cover
        """Return a string representation of the report."""
        table = self.console_table()
        io_file = StringIO()
        Console(file=io_file).print(table)
        return io_file.getvalue()

name instance-attribute

name: str

报告的名称。

cases instance-attribute

cases: list[ReportCase[InputsT, OutputT, MetadataT]]

报告中的案例。

failures class-attribute instance-attribute

failures: list[
    ReportCaseFailure[InputsT, OutputT, MetadataT]
] = field(default_factory=list)

报告中的失败案例。这些是任务执行时引发异常的案例。

trace_id class-attribute instance-attribute

trace_id: str | None = None

评估的追踪 ID。

span_id class-attribute instance-attribute

span_id: str | None = None

评估的跨度 ID。

print

print(
    width: int | None = None,
    baseline: (
        EvaluationReport[InputsT, OutputT, MetadataT] | None
    ) = None,
    *,
    include_input: bool = False,
    include_metadata: bool = False,
    include_expected_output: bool = False,
    include_output: bool = False,
    include_durations: bool = True,
    include_total_duration: bool = False,
    include_removed_cases: bool = False,
    include_averages: bool = True,
    include_errors: bool = True,
    include_error_stacktrace: bool = False,
    include_evaluator_failures: bool = True,
    input_config: RenderValueConfig | None = None,
    metadata_config: RenderValueConfig | None = None,
    output_config: RenderValueConfig | None = None,
    score_configs: (
        dict[str, RenderNumberConfig] | None
    ) = None,
    label_configs: (
        dict[str, RenderValueConfig] | None
    ) = None,
    metric_configs: (
        dict[str, RenderNumberConfig] | None
    ) = None,
    duration_config: RenderNumberConfig | None = None,
    include_reasons: bool = False
)

将此报告打印到控制台,可选择与基准报告进行比较。

如果您想对输出有更多控制,请改用 console_table 并将其传递给 rich.Console.print

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def print(
    self,
    width: int | None = None,
    baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
    *,
    include_input: bool = False,
    include_metadata: bool = False,
    include_expected_output: bool = False,
    include_output: bool = False,
    include_durations: bool = True,
    include_total_duration: bool = False,
    include_removed_cases: bool = False,
    include_averages: bool = True,
    include_errors: bool = True,
    include_error_stacktrace: bool = False,
    include_evaluator_failures: bool = True,
    input_config: RenderValueConfig | None = None,
    metadata_config: RenderValueConfig | None = None,
    output_config: RenderValueConfig | None = None,
    score_configs: dict[str, RenderNumberConfig] | None = None,
    label_configs: dict[str, RenderValueConfig] | None = None,
    metric_configs: dict[str, RenderNumberConfig] | None = None,
    duration_config: RenderNumberConfig | None = None,
    include_reasons: bool = False,
):  # pragma: no cover
    """Print this report to the console, optionally comparing it to a baseline report.

    If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
    """
    table = self.console_table(
        baseline=baseline,
        include_input=include_input,
        include_metadata=include_metadata,
        include_expected_output=include_expected_output,
        include_output=include_output,
        include_durations=include_durations,
        include_total_duration=include_total_duration,
        include_removed_cases=include_removed_cases,
        include_averages=include_averages,
        include_evaluator_failures=include_evaluator_failures,
        input_config=input_config,
        metadata_config=metadata_config,
        output_config=output_config,
        score_configs=score_configs,
        label_configs=label_configs,
        metric_configs=metric_configs,
        duration_config=duration_config,
        include_reasons=include_reasons,
    )
    console = Console(width=width)
    console.print(table)
    if include_errors and self.failures:
        failures_table = self.failures_table(
            include_input=include_input,
            include_metadata=include_metadata,
            include_expected_output=include_expected_output,
            include_error_message=True,
            include_error_stacktrace=include_error_stacktrace,
            input_config=input_config,
            metadata_config=metadata_config,
        )
        console.print(failures_table, style='red')

console_table

console_table(
    baseline: (
        EvaluationReport[InputsT, OutputT, MetadataT] | None
    ) = None,
    *,
    include_input: bool = False,
    include_metadata: bool = False,
    include_expected_output: bool = False,
    include_output: bool = False,
    include_durations: bool = True,
    include_total_duration: bool = False,
    include_removed_cases: bool = False,
    include_averages: bool = True,
    include_evaluator_failures: bool = True,
    input_config: RenderValueConfig | None = None,
    metadata_config: RenderValueConfig | None = None,
    output_config: RenderValueConfig | None = None,
    score_configs: (
        dict[str, RenderNumberConfig] | None
    ) = None,
    label_configs: (
        dict[str, RenderValueConfig] | None
    ) = None,
    metric_configs: (
        dict[str, RenderNumberConfig] | None
    ) = None,
    duration_config: RenderNumberConfig | None = None,
    include_reasons: bool = False
) -> Table

返回一个包含此报告数据的表格,或此报告与基准报告之间的差异。

可选择性地包含输入和输出的详细信息。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
def console_table(
    self,
    baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
    *,
    include_input: bool = False,
    include_metadata: bool = False,
    include_expected_output: bool = False,
    include_output: bool = False,
    include_durations: bool = True,
    include_total_duration: bool = False,
    include_removed_cases: bool = False,
    include_averages: bool = True,
    include_evaluator_failures: bool = True,
    input_config: RenderValueConfig | None = None,
    metadata_config: RenderValueConfig | None = None,
    output_config: RenderValueConfig | None = None,
    score_configs: dict[str, RenderNumberConfig] | None = None,
    label_configs: dict[str, RenderValueConfig] | None = None,
    metric_configs: dict[str, RenderNumberConfig] | None = None,
    duration_config: RenderNumberConfig | None = None,
    include_reasons: bool = False,
) -> Table:
    """Return a table containing the data from this report, or the diff between this report and a baseline report.

    Optionally include input and output details.
    """
    renderer = EvaluationRenderer(
        include_input=include_input,
        include_metadata=include_metadata,
        include_expected_output=include_expected_output,
        include_output=include_output,
        include_durations=include_durations,
        include_total_duration=include_total_duration,
        include_removed_cases=include_removed_cases,
        include_averages=include_averages,
        include_error_message=False,
        include_error_stacktrace=False,
        include_evaluator_failures=include_evaluator_failures,
        input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
        metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
        output_config=output_config or _DEFAULT_VALUE_CONFIG,
        score_configs=score_configs or {},
        label_configs=label_configs or {},
        metric_configs=metric_configs or {},
        duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
        include_reasons=include_reasons,
    )
    if baseline is None:
        return renderer.build_table(self)
    else:  # pragma: no cover
        return renderer.build_diff_table(self, baseline)

failures_table

failures_table(
    *,
    include_input: bool = False,
    include_metadata: bool = False,
    include_expected_output: bool = False,
    include_error_message: bool = True,
    include_error_stacktrace: bool = True,
    input_config: RenderValueConfig | None = None,
    metadata_config: RenderValueConfig | None = None
) -> Table

返回一个包含此报告中失败案例的表格。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
def failures_table(
    self,
    *,
    include_input: bool = False,
    include_metadata: bool = False,
    include_expected_output: bool = False,
    include_error_message: bool = True,
    include_error_stacktrace: bool = True,
    input_config: RenderValueConfig | None = None,
    metadata_config: RenderValueConfig | None = None,
) -> Table:
    """Return a table containing the failures in this report."""
    renderer = EvaluationRenderer(
        include_input=include_input,
        include_metadata=include_metadata,
        include_expected_output=include_expected_output,
        include_output=False,
        include_durations=False,
        include_total_duration=False,
        include_removed_cases=False,
        include_averages=False,
        input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
        metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
        output_config=_DEFAULT_VALUE_CONFIG,
        score_configs={},
        label_configs={},
        metric_configs={},
        duration_config=_DEFAULT_DURATION_CONFIG,
        include_reasons=False,
        include_error_message=include_error_message,
        include_error_stacktrace=include_error_stacktrace,
        include_evaluator_failures=False,  # Not applicable for failures table
    )
    return renderer.build_failures_table(self)

__str__

__str__() -> str

返回报告的字符串表示形式。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
359
360
361
362
363
364
def __str__(self) -> str:  # pragma: lax no cover
    """Return a string representation of the report."""
    table = self.console_table()
    io_file = StringIO()
    Console(file=io_file).print(table)
    return io_file.getvalue()

RenderValueConfig

基类:TypedDict

用于在评估报告中渲染值的配置。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
370
371
372
373
374
375
376
class RenderValueConfig(TypedDict, total=False):
    """A configuration for rendering a values in an Evaluation report."""

    value_formatter: str | Callable[[Any], str]
    diff_checker: Callable[[Any, Any], bool] | None
    diff_formatter: Callable[[Any, Any], str | None] | None
    diff_style: str

RenderNumberConfig

基类:TypedDict

用于在评估报告中渲染特定分数或指标的配置。

请参阅 _RenderNumber 的实现以更清楚地了解这些参数如何影响渲染。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
class RenderNumberConfig(TypedDict, total=False):
    """A configuration for rendering a particular score or metric in an Evaluation report.

    See the implementation of `_RenderNumber` for more clarity on how these parameters affect the rendering.
    """

    value_formatter: str | Callable[[float | int], str]
    """The logic to use for formatting values.

    * If not provided, format as ints if all values are ints, otherwise at least one decimal place and at least four significant figures.
    * You can also use a custom string format spec, e.g. '{:.3f}'
    * You can also use a custom function, e.g. lambda x: f'{x:.3f}'
    """
    diff_formatter: str | Callable[[float | int, float | int], str | None] | None
    """The logic to use for formatting details about the diff.

    The strings produced by the value_formatter will always be included in the reports, but the diff_formatter is
    used to produce additional text about the difference between the old and new values, such as the absolute or
    relative difference.

    * If not provided, format as ints if all values are ints, otherwise at least one decimal place and at least four
        significant figures, and will include the percentage change.
    * You can also use a custom string format spec, e.g. '{:+.3f}'
    * You can also use a custom function, e.g. lambda x: f'{x:+.3f}'.
        If this function returns None, no extra diff text will be added.
    * You can also use None to never generate extra diff text.
    """
    diff_atol: float
    """The absolute tolerance for considering a difference "significant".

    A difference is "significant" if `abs(new - old) < self.diff_atol + self.diff_rtol * abs(old)`.

    If a difference is not significant, it will not have the diff styles applied. Note that we still show
    both the rendered before and after values in the diff any time they differ, even if the difference is not
    significant. (If the rendered values are exactly the same, we only show the value once.)

    If not provided, use 1e-6.
    """
    diff_rtol: float
    """The relative tolerance for considering a difference "significant".

    See the description of `diff_atol` for more details about what makes a difference "significant".

    If not provided, use 0.001 if all values are ints, otherwise 0.05.
    """
    diff_increase_style: str
    """The style to apply to diffed values that have a significant increase.

    See the description of `diff_atol` for more details about what makes a difference "significant".

    If not provided, use green for scores and red for metrics. You can also use arbitrary `rich` styles, such as "bold red".
    """
    diff_decrease_style: str
    """The style to apply to diffed values that have significant decrease.

    See the description of `diff_atol` for more details about what makes a difference "significant".

    If not provided, use red for scores and green for metrics. You can also use arbitrary `rich` styles, such as "bold red".
    """

value_formatter instance-attribute

value_formatter: str | Callable[[float | int], str]

用于格式化值的逻辑。

  • 如果未提供,当所有值都是整数时格式化为整数,否则至少保留一位小数和至少四位有效数字。
  • 您也可以使用自定义字符串格式规范,例如 '{:.3f}'
  • 您也可以使用自定义函数,例如 lambda x: f'{x:.3f}'

diff_formatter instance-attribute

diff_formatter: (
    str
    | Callable[[float | int, float | int], str | None]
    | None
)

用于格式化差异详情的逻辑。

由 value_formatter 生成的字符串将始终包含在报告中,但 diff_formatter 用于生成关于新旧值之间差异的附加文本,例如绝对或相对差异。

  • 如果未提供,当所有值都是整数时格式化为整数,否则至少保留一位小数和至少四位有效数字,并且会包含百分比变化。
  • 您也可以使用自定义字符串格式规范,例如 '{:+.3f}'
  • 您也可以使用自定义函数,例如 lambda x: f'{x:+.3f}'。如果此函数返回 None,则不会添加额外的差异文本。
  • 您也可以使用 None 来从不生成额外的差异文本。

diff_atol instance-attribute

diff_atol: float

用于判断差异是否“显著”的绝对容差。

如果 abs(new - old) < self.diff_atol + self.diff_rtol * abs(old),则差异是“显著的”。

如果差异不显著,将不会应用差异样式。请注意,只要渲染前后的值不同,我们仍然会显示它们,即使差异不显著。(如果渲染后的值完全相同,我们只显示一次该值。)

如果未提供,则使用 1e-6。

diff_rtol instance-attribute

diff_rtol: float

用于判断差异是否“显著”的相对容差。

有关什么构成“显著”差异的更多详情,请参阅 diff_atol 的描述。

如果未提供,当所有值都是整数时使用 0.001,否则使用 0.05。

diff_increase_style instance-attribute

diff_increase_style: str

应用于有显著增加的差异值的样式。

有关什么构成“显著”差异的更多详情,请参阅 diff_atol 的描述。

如果未提供,分数使用绿色,指标使用红色。您也可以使用任意 rich 样式,例如 "bold red"。

diff_decrease_style instance-attribute

diff_decrease_style: str

应用于有显著减少的差异值的样式。

有关什么构成“显著”差异的更多详情,请参阅 diff_atol 的描述。

如果未提供,分数使用红色,指标使用绿色。您也可以使用任意 rich 样式,例如 "bold red"。

EvaluationRenderer dataclass

一个用于渲染 EvalReport 或两个 EvalReport 之间差异的类。

源代码位于 pydantic_evals/pydantic_evals/reporting/__init__.py
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
@dataclass(kw_only=True)
class EvaluationRenderer:
    """A class for rendering an EvalReport or the diff between two EvalReports."""

    # Columns to include
    include_input: bool
    include_metadata: bool
    include_expected_output: bool
    include_output: bool
    include_durations: bool
    include_total_duration: bool

    # Rows to include
    include_removed_cases: bool
    include_averages: bool

    input_config: RenderValueConfig
    metadata_config: RenderValueConfig
    output_config: RenderValueConfig
    score_configs: dict[str, RenderNumberConfig]
    label_configs: dict[str, RenderValueConfig]
    metric_configs: dict[str, RenderNumberConfig]
    duration_config: RenderNumberConfig

    # Data to include
    include_reasons: bool  # only applies to reports, not to diffs

    include_error_message: bool
    include_error_stacktrace: bool
    include_evaluator_failures: bool

    def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.scores for case in self._all_cases(report, baseline))

    def include_labels(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.labels for case in self._all_cases(report, baseline))

    def include_metrics(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.metrics for case in self._all_cases(report, baseline))

    def include_assertions(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return any(case.assertions for case in self._all_cases(report, baseline))

    def include_evaluator_failures_column(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
        return self.include_evaluator_failures and any(
            case.evaluator_failures for case in self._all_cases(report, baseline)
        )

    def _all_cases(self, report: EvaluationReport, baseline: EvaluationReport | None) -> list[ReportCase]:
        if not baseline:
            return report.cases
        else:
            return report.cases + self._baseline_cases_to_include(report, baseline)

    def _baseline_cases_to_include(self, report: EvaluationReport, baseline: EvaluationReport) -> list[ReportCase]:
        if self.include_removed_cases:
            return baseline.cases
        report_case_names = {case.name for case in report.cases}
        return [case for case in baseline.cases if case.name in report_case_names]

    def _get_case_renderer(
        self, report: EvaluationReport, baseline: EvaluationReport | None = None
    ) -> ReportCaseRenderer:
        input_renderer = _ValueRenderer.from_config(self.input_config)
        metadata_renderer = _ValueRenderer.from_config(self.metadata_config)
        output_renderer = _ValueRenderer.from_config(self.output_config)
        score_renderers = self._infer_score_renderers(report, baseline)
        label_renderers = self._infer_label_renderers(report, baseline)
        metric_renderers = self._infer_metric_renderers(report, baseline)
        duration_renderer = _NumberRenderer.infer_from_config(
            self.duration_config, 'duration', [x.task_duration for x in self._all_cases(report, baseline)]
        )

        return ReportCaseRenderer(
            include_input=self.include_input,
            include_metadata=self.include_metadata,
            include_expected_output=self.include_expected_output,
            include_output=self.include_output,
            include_scores=self.include_scores(report, baseline),
            include_labels=self.include_labels(report, baseline),
            include_metrics=self.include_metrics(report, baseline),
            include_assertions=self.include_assertions(report, baseline),
            include_reasons=self.include_reasons,
            include_durations=self.include_durations,
            include_total_duration=self.include_total_duration,
            include_error_message=self.include_error_message,
            include_error_stacktrace=self.include_error_stacktrace,
            include_evaluator_failures=self.include_evaluator_failures_column(report, baseline),
            input_renderer=input_renderer,
            metadata_renderer=metadata_renderer,
            output_renderer=output_renderer,
            score_renderers=score_renderers,
            label_renderers=label_renderers,
            metric_renderers=metric_renderers,
            duration_renderer=duration_renderer,
        )

    def build_table(self, report: EvaluationReport) -> Table:
        case_renderer = self._get_case_renderer(report)
        table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}')
        for case in report.cases:
            table.add_row(*case_renderer.build_row(case))

        if self.include_averages:  # pragma: no branch
            average = report.averages()
            if average:  # pragma: no branch
                table.add_row(*case_renderer.build_aggregate_row(average))

        return table

    def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table:
        report_cases = report.cases
        baseline_cases = self._baseline_cases_to_include(report, baseline)

        report_cases_by_id = {case.name: case for case in report_cases}
        baseline_cases_by_id = {case.name: case for case in baseline_cases}

        diff_cases: list[tuple[ReportCase, ReportCase]] = []
        removed_cases: list[ReportCase] = []
        added_cases: list[ReportCase] = []

        for case_id in sorted(set(baseline_cases_by_id.keys()) | set(report_cases_by_id.keys())):
            maybe_baseline_case = baseline_cases_by_id.get(case_id)
            maybe_report_case = report_cases_by_id.get(case_id)
            if maybe_baseline_case and maybe_report_case:
                diff_cases.append((maybe_baseline_case, maybe_report_case))
            elif maybe_baseline_case:
                removed_cases.append(maybe_baseline_case)
            elif maybe_report_case:
                added_cases.append(maybe_report_case)
            else:  # pragma: no cover
                assert False, 'This should be unreachable'

        case_renderer = self._get_case_renderer(report, baseline)
        diff_name = baseline.name if baseline.name == report.name else f'{baseline.name}{report.name}'
        table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}')
        for baseline_case, new_case in diff_cases:
            table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
        for case in added_cases:
            row = case_renderer.build_row(case)
            row[0] = f'[green]+ Added Case[/]\n{row[0]}'
            table.add_row(*row)
        for case in removed_cases:
            row = case_renderer.build_row(case)
            row[0] = f'[red]- Removed Case[/]\n{row[0]}'
            table.add_row(*row)

        if self.include_averages:  # pragma: no branch
            report_average = ReportCaseAggregate.average(report_cases)
            baseline_average = ReportCaseAggregate.average(baseline_cases)
            table.add_row(*case_renderer.build_diff_aggregate_row(report_average, baseline_average))

        return table

    def build_failures_table(self, report: EvaluationReport) -> Table:
        case_renderer = self._get_case_renderer(report)
        table = case_renderer.build_failures_table('Case Failures')
        for case in report.failures:
            table.add_row(*case_renderer.build_failure_row(case))

        return table

    def _infer_score_renderers(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> dict[str, _NumberRenderer]:
        all_cases = self._all_cases(report, baseline)

        values_by_name: dict[str, list[float | int]] = {}
        for case in all_cases:
            for k, score in case.scores.items():
                values_by_name.setdefault(k, []).append(score.value)

        all_renderers: dict[str, _NumberRenderer] = {}
        for name, values in values_by_name.items():
            merged_config = _DEFAULT_NUMBER_CONFIG.copy()
            merged_config.update(self.score_configs.get(name, {}))
            all_renderers[name] = _NumberRenderer.infer_from_config(merged_config, 'score', values)
        return all_renderers

    def _infer_label_renderers(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> dict[str, _ValueRenderer]:
        all_cases = self._all_cases(report, baseline)
        all_names: set[str] = set()
        for case in all_cases:
            for k in case.labels:
                all_names.add(k)

        all_renderers: dict[str, _ValueRenderer] = {}
        for name in all_names:
            merged_config = _DEFAULT_VALUE_CONFIG.copy()
            merged_config.update(self.label_configs.get(name, {}))
            all_renderers[name] = _ValueRenderer.from_config(merged_config)
        return all_renderers

    def _infer_metric_renderers(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> dict[str, _NumberRenderer]:
        all_cases = self._all_cases(report, baseline)

        values_by_name: dict[str, list[float | int]] = {}
        for case in all_cases:
            for k, v in case.metrics.items():
                values_by_name.setdefault(k, []).append(v)

        all_renderers: dict[str, _NumberRenderer] = {}
        for name, values in values_by_name.items():
            merged_config = _DEFAULT_NUMBER_CONFIG.copy()
            merged_config.update(self.metric_configs.get(name, {}))
            all_renderers[name] = _NumberRenderer.infer_from_config(merged_config, 'metric', values)
        return all_renderers

    def _infer_duration_renderer(
        self, report: EvaluationReport, baseline: EvaluationReport | None
    ) -> _NumberRenderer:  # pragma: no cover
        all_cases = self._all_cases(report, baseline)
        all_durations = [x.task_duration for x in all_cases]
        if self.include_total_duration:
            all_durations += [x.total_duration for x in all_cases]
        return _NumberRenderer.infer_from_config(self.duration_config, 'duration', all_durations)