Manual Evaluation Explorer
The manual evaluation datasets are available on Hugging Face at adbX/reproscreener_manual_evaluations.
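For example, the agreement table used in this notebook can be loaded directly with pandas (a minimal sketch; agreement_gpt.csv is the file read in the code below):

import pandas as pd

df = pd.read_csv(
    "https://huggingface.co/datasets/adbX/reproscreener_manual_evaluations/resolve/main/agreement_gpt.csv"
)
print(df.head())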
Source code for docs/02_manual_eval_explorer.py
Tip: paste this code into an empty cell, and the marimo editor will create cells for you
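Alternatively, save the listing below as a .py file and open it with the marimo CLI (marimo edit 02_manual_eval_explorer.py).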
import marimo
__generated_with = "0.14.8"
app = marimo.App(width="full")
@app.cell
def _():
    import marimo as mo
    return (mo,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# Manual Evaluation Explorer""")
return
@app.cell(hide_code=True)
def _():
    import pandas as pd
    import numpy as np
    return np, pd
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Summary of GPT vs. Manual evaluations on preprint abstracts""")
return
@app.cell(hide_code=True)
def _(mo, np, pd):
    df_agreement_gpt = pd.read_csv(
        "https://huggingface.co/datasets/adbX/reproscreener_manual_evaluations/resolve/main/agreement_gpt.csv"
    )
    # Clean up columns: make "paper_id" the index and drop metadata/description columns
    df_agreement_gpt = df_agreement_gpt.set_index("paper_id")
    df_agreement_gpt = df_agreement_gpt.drop(
        columns=["evaluation_type", "source_file"]
        + [col for col in df_agreement_gpt.columns if "_description" in col]
    )
    # Remove the gpt_ prefix from column names
    df_agreement_gpt = df_agreement_gpt.rename(
        columns={
            col: col.replace("gpt_", "")
            for col in df_agreement_gpt.columns
            if col.startswith("gpt_")
        }
    )
    # Metric columns hold the GPT labels; each has a matching <metric>_agreement column
    all_columns = df_agreement_gpt.columns.tolist()
    metric_columns = [col for col in all_columns if not col.endswith("_agreement")]
    results = {}
    for metric in metric_columns:
        gpt_col = metric
        agreement_col = f"{metric}_agreement"
        if agreement_col in df_agreement_gpt.columns:
            gpt_vals = df_agreement_gpt[gpt_col].astype(bool)
            agreement_vals = df_agreement_gpt[agreement_col]
            # Reconstruct the manual evaluation: keep GPT when agreement=1, invert when agreement=0
            manual_vals = np.where(agreement_vals == 1, gpt_vals, ~gpt_vals)
            # Add the reconstructed manual labels to the agreement_gpt DataFrame
            df_agreement_gpt[f"manual_{metric}"] = manual_vals.astype(bool)
            results[metric] = {
                'gpt_sum': gpt_vals.sum(),
                'manual_sum': manual_vals.sum(),
                'gpt_proportion': gpt_vals.mean(),
                'manual_proportion': manual_vals.mean(),
                'gpt_manual_agreement': agreement_vals.mean(),
                'total_n': len(gpt_vals),
            }
    results_df = pd.DataFrame(results).T
    tab_decimal = results_df
    # Percent view: show the proportion columns as whole-number percentages
    tab_percent = results_df.copy()
    for col in ['gpt_proportion', 'manual_proportion', 'gpt_manual_agreement']:
        tab_percent[col] = tab_percent[col].mul(100).round(0).astype(int).astype(str).add('%')
    tabs = mo.ui.tabs({"percent": tab_percent, "decimal": tab_decimal})
    tabs
    return df_agreement_gpt, metric_columns
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Select a metric to view evaluation results for each paper""")
return
@app.cell(hide_code=True)
def _(metric_columns, mo):
    dropdown = mo.ui.dropdown(metric_columns, value=metric_columns[0] if metric_columns else None)
    dropdown
    return (dropdown,)
@app.cell(hide_code=True)
def _(df_agreement_gpt, dropdown, pd):
    if dropdown.value:
        selected_metric = dropdown.value
        # Compare the GPT and reconstructed manual labels for the selected metric
        selected_metric_df = pd.DataFrame({
            'gpt': df_agreement_gpt[selected_metric].astype(bool),
            'manual': df_agreement_gpt[f"manual_{selected_metric}"].astype(bool),
            'agreement': df_agreement_gpt[f"{selected_metric}_agreement"],
        })
    else:
        # No metric selected yet: show an empty table
        selected_metric_df = pd.DataFrame()
    selected_metric_df
    return
if __name__ == "__main__":
    app.run()
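To make the agreement logic above concrete, here is a minimal standalone sketch (toy values, not taken from the dataset) of how a manual label is recovered from a GPT label and an agreement flag:

import numpy as np
import pandas as pd

# Toy example: three papers, one metric
gpt_vals = pd.Series([True, False, True])   # GPT's label for each paper
agreement_vals = pd.Series([1, 0, 0])       # 1 = annotator agreed with GPT, 0 = disagreed
# Keep the GPT label where agreement is 1, invert it where agreement is 0
manual_vals = np.where(agreement_vals == 1, gpt_vals, ~gpt_vals)
print(manual_vals)  # [ True  True False]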