from langchain.evaluation import load_evaluator
# Scoring helper based on the Levenshtein string distance.
def evaluation(output, expected_output):
    """Score a prediction against its reference with a string-distance evaluator.

    Args:
        output: Text handed to the evaluator as ``prediction``.
        expected_output: Text handed to the evaluator as ``reference``.

    Returns:
        The value stored under the ``"score"`` key of the evaluator result.
    """
    # Request the Levenshtein metric explicitly instead of relying on the
    # evaluator's default distance, so the score matches the stated intent.
    # NOTE(review): confirm the default metric of the installed LangChain
    # version; if it was not Levenshtein, previously recorded scores differ.
    evaluator = load_evaluator("string_distance", distance="levenshtein")
    result = evaluator.evaluate_strings(
        prediction=output,
        reference=expected_output,
    )
    return result["score"]
def run_experiment(experiment_name, system_prompt):
    """Run every item of the "capital_cities" dataset through the app and score it.

    For each dataset item the application is called with the item's input and
    the given system prompt, the resulting generation is linked to the item
    under ``experiment_name``, and a "string_distance" score computed by
    ``evaluation`` is attached to that generation.

    Args:
        experiment_name: Run name passed to ``item.link``.
        system_prompt: System prompt forwarded to ``run_my_custom_llm_app``.
    """
    # Fetch the dataset.
    capital_cities = langfuse.get_dataset("capital_cities")

    for dataset_item in capital_cities.items:
        # Generate a prediction with the application.
        prediction, generation = run_my_custom_llm_app(
            dataset_item.input, system_prompt
        )

        # Link the observation/generation object (or its id) to this dataset item.
        dataset_item.link(generation, experiment_name)

        # Compare the prediction with the expected output and record the result.
        distance_score = evaluation(prediction, dataset_item.expected_output)
        generation.score(name="string_distance", value=distance_score)
4. 評価の実行
首都の尋ね方（システムプロンプト）をいろいろ変えて実験を実行し、それぞれの出力を期待値との文字列距離で評価します。
# One (experiment name, system prompt) pair per prompt variant; the experiments
# run in the order listed so their scores can be compared afterwards.
_PROMPT_VARIANTS = (
    (
        "famous_city",
        "The user will input countries, respond with the most famous city in this country",
    ),
    (
        "directly_ask",
        "What is the capital of the following country?",
    ),
    (
        "asking_specifically",
        "The user will input countries, respond with only the name of the capital",
    ),
    (
        "asking_specifically_2nd_try",
        "The user will input countries, respond with only the name of the capital. State only the name of the city.",
    ),
)

for _experiment_name, _system_prompt in _PROMPT_VARIANTS:
    run_experiment(_experiment_name, _system_prompt)