Manual Evaluation Explorer¶
In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib.colors import ListedColormap
from textwrap import wrap
import matplotx
all_columns = [
"problem",
"problem_agreement",
"objective",
"objective_agreement",
"research_method",
"research_method_agreement",
"research_questions",
"research_questions_agreement",
"pseudocode",
"pseudocode_agreement",
"dataset",
"dataset_agreement",
"hypothesis",
"hypothesis_agreement",
"prediction",
"prediction_agreement",
"code_available",
"code_available_agreement",
"software_dependencies",
"software_dependencies_agreement",
"experiment_setup",
"experiment_setup_agreement"
]
metric_columns = [col for col in all_columns if not col.endswith("_agreement")]
metrics_display_map = {
"experiment_setup": "Experimental setup",
"hypothesis": "Hypothesis",
"code_available": "Code available",
"objective": "Objective/ Goal",
"prediction": "Prediction",
"problem": "Research problem",
"pseudocode": "Pseudocode",
"research_method": "Research method",
"research_questions": "Research questions",
"dataset": "Dataset",
"software_dependencies": "Software dependencies",
}
metric_order = [
"problem", "objective", "research_method", "research_questions",
"pseudocode", "dataset", "hypothesis", "prediction",
"code_available", "software_dependencies", "experiment_setup"
]
In [47]:
import matplotlib.pyplot as plt
# from matplotlib import font_manager
from highlight_text import fig_text
# font_dir = '/mnt/c/Users/adb/Desktop/stuff/fonts/'
# font_dir = '/Users/adb/Library/Mobile Documents/com~apple~CloudDocs/fonts/'
# font_name = 'IBMPlexSans-Regular.ttf'
# font_name = 'Inter-Bold.ttf'
# # font_name = 'FiraSans-Regular.ttf'
# font_manager.fontManager.addfont(font_dir + font_name)
# prop = font_manager.FontProperties(fname=font_dir + font_name)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Helvetica','IBM Plex Sans', 'Inter', 'Fira Sans']
# plt.rcParams['font.weight'] = 'medium'
# plt.rcParams['font.size'] = 12
# Set default text color
plt.rcParams['text.color'] = '#1E1E1E'
# plt.rcParams['font.sans-serif'] = prop.get_name()
thesis_figures_dir = "../../USCthesis/figures/"
current_figures_dir = "../reports/figures/"
In [48]:
df_manuscript_manual = pd.read_csv("https://huggingface.co/datasets/adbX/reproscreener_manual_evaluations/resolve/main/manuscript.csv")
df_manuscript_manual.set_index("paper_id", inplace=True)
df_manuscript_manual = df_manuscript_manual.drop(
columns=["evaluation_type", "source_file"]
+ [col for col in df_manuscript_manual.columns if "_description" in col]
)
df_manuscript_manual = df_manuscript_manual.rename(columns={"code_available_in_article": "code_available"})
df_manuscript_manual.head()
Out[48]:
| paper_id | paper_url | notes | empirical_dataset | code_available | papers_with_code_link_available | papers_with_code_link_matches | result_replication_code_available | is_package | has_wrapper_scripts | hardware_specifications_provided | ... | will_it_reproduce_desc | problem | objective | research_method | research_questions | pseudocode | dataset | hypothesis | prediction | experiment_setup |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1606.04671 | https://arxiv.org/pdf/1606.04671.pdf | No code or data released | NaN | False | False | False | NaN | NaN | False | NaN | ... | NaN | False | False | False | False | False | False | False | False | True |
| 1903.09668 | https://arxiv.org/pdf/1903.09668.pdf | No code or data released | NaN | False | False | False | NaN | NaN | False | NaN | ... | NaN | False | False | False | False | True | False | False | False | True |
| 1904.10554 | https://arxiv.org/pdf/1904.10554.pdf | No code or data released | NaN | False | False | False | NaN | NaN | False | NaN | ... | NaN | False | False | False | False | True | False | False | False | True |
| 1908.05659 | https://arxiv.org/pdf/1908.05659.pdf | 90 page review, no experiments, mostly math | NaN | False | False | False | NaN | NaN | False | NaN | ... | NaN | False | False | False | False | False | False | False | False | False |
| 1909.00931 | https://arxiv.org/pdf/1909.00931.pdf | NaN | NaN | True | True | True | NaN | NaN | False | True | ... | NaN | True | False | False | False | True | True | True | False | True |

5 rows × 28 columns
Reproscreener (regex) vs. Manual evaluations of full manuscripts¶
In [49]:
df_manuscript_regex = pd.read_csv("https://huggingface.co/datasets/adbX/reproscreener_manual_evaluations/resolve/main/repro_eval_tex.csv")
df_manuscript_regex = df_manuscript_regex.rename(columns={"method_source_code": "code_available", "id": "paper_id"})
df_manuscript_regex.set_index("paper_id", inplace=True)
# dataset is flagged if any of the training/test/validation data mentions were detected
df_manuscript_regex['dataset'] = (
    df_manuscript_regex['training_data'].astype(bool)
    | df_manuscript_regex['test_data'].astype(bool)
    | df_manuscript_regex['validation_data'].astype(bool)
)
df_manuscript_regex = df_manuscript_regex.drop(columns=["index","training_data", "test_data", "validation_data", "title"])
df_manuscript_regex[metric_columns] = df_manuscript_regex[metric_columns].astype(bool)
In [50]:
# Align indices across manual and regex dataframes and compute agreement
common_idx = df_manuscript_manual.index.intersection(df_manuscript_regex.index)
manual_bool = df_manuscript_manual.loc[common_idx, metric_columns].astype(bool)
regex_bool = df_manuscript_regex.loc[common_idx, metric_columns].astype(bool)
results_manuscript = {}
for manuscript_metric in metric_columns:
manuscript_regex_vals = regex_bool[manuscript_metric]
manuscript_manual_vals = manual_bool[manuscript_metric]
results_manuscript[manuscript_metric] = {
'regex_sum': int(manuscript_regex_vals.sum()),
'manual_sum': int(manuscript_manual_vals.sum()),
'regex_proportion': float(manuscript_regex_vals.mean()),
'regex_manual_agreement': float((manuscript_regex_vals == manuscript_manual_vals).mean()),
'manual_proportion': float(manuscript_manual_vals.mean()),
'total_n': int(len(manuscript_regex_vals))
}
manuscript_results_df = pd.DataFrame(results_manuscript).T
tab_decimal_manu = manuscript_results_df
tab_percent_manu = manuscript_results_df.copy()
tab_percent_manu['regex_proportion'] = tab_percent_manu['regex_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_manu['manual_proportion'] = tab_percent_manu['manual_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_manu['regex_manual_agreement'] = tab_percent_manu['regex_manual_agreement'].mul(100).round(0).astype(int).astype(str).add('%')
# tabs_manu = mo.ui.tabs({"percent": tab_percent_manu, "decimal": tab_decimal_manu})
tab_percent_manu
Out[50]:
| | regex_sum | manual_sum | regex_proportion | regex_manual_agreement | manual_proportion | total_n |
|---|---|---|---|---|---|---|
| problem | 44.0 | 15.0 | 88% | 38% | 30% | 50.0 |
| objective | 39.0 | 4.0 | 78% | 30% | 8% | 50.0 |
| research_method | 43.0 | 10.0 | 86% | 30% | 20% | 50.0 |
| research_questions | 45.0 | 3.0 | 90% | 16% | 6% | 50.0 |
| pseudocode | 7.0 | 22.0 | 14% | 62% | 44% | 50.0 |
| dataset | 19.0 | 31.0 | 38% | 48% | 62% | 50.0 |
| hypothesis | 21.0 | 8.0 | 42% | 62% | 16% | 50.0 |
| prediction | 34.0 | 0.0 | 68% | 32% | 0% | 50.0 |
| code_available | 23.0 | 22.0 | 46% | 82% | 44% | 50.0 |
| software_dependencies | 0.0 | 14.0 | 0% | 72% | 28% | 50.0 |
| experiment_setup | 14.0 | 37.0 | 28% | 54% | 74% | 50.0 |
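The `regex_manual_agreement` column is plain percent agreement: the fraction of papers for which the regex label equals the manual label. A minimal sketch of that computation on made-up labels (not data from this notebook):

```python
import pandas as pd

# Illustrative only: hypothetical labels for one metric across four papers
regex_labels = pd.Series([True, True, False, False])
manual_labels = pd.Series([True, False, False, True])

# Percent agreement = share of papers where the two labels match
agreement = (regex_labels == manual_labels).mean()
print(agreement)  # 0.5
```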
Reproscreener (regex) vs. Manual evaluations of abstracts¶
In [51]:
# Load regex results for abstracts computed locally
df_abstract_regex = pd.read_csv("../reports/tables/abstract_regex_gs.csv")
df_abstract_regex = df_abstract_regex.set_index("paper_id")
# Ensure boolean dtype for metrics
available_cols = [c for c in metric_columns if c in df_abstract_regex.columns]
df_abstract_regex[available_cols] = df_abstract_regex[available_cols].astype(bool)
df_abstract_regex.head()
Out[51]:
| paper_id | problem | objective | research_method | research_questions | pseudocode | dataset | hypothesis | prediction | code_available | software_dependencies | experiment_setup |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1606.04671 | False | False | False | False | False | False | False | False | False | False | False |
| 1903.09668 | False | False | True | True | False | False | False | True | False | False | False |
| 1904.10554 | False | False | False | True | False | False | False | False | False | False | False |
| 1908.05659 | False | False | False | True | False | False | False | False | False | False | False |
| 1909.00931 | False | False | True | True | False | False | False | False | False | False | False |
In [52]:
# Load GPT agreement for abstracts and derive manual columns from agreement
df_abs_gpt_agreement = pd.read_csv("https://huggingface.co/datasets/adbX/reproscreener_manual_evaluations/resolve/main/agreement_gpt.csv")
# Clean up columns - exclude metadata and description columns
df_abs_gpt_agreement = df_abs_gpt_agreement.set_index("paper_id")
df_abs_gpt_agreement = df_abs_gpt_agreement.drop(
columns=["evaluation_type", "source_file"]
+ [col for col in df_abs_gpt_agreement.columns if "_description" in col]
)
# Remove gpt_ prefix from column names
df_abs_gpt_agreement = df_abs_gpt_agreement.rename(
columns={
col: col.replace("gpt_", "")
for col in df_abs_gpt_agreement.columns
if col.startswith("gpt_")
}
)
# Compute manual_<metric> using agreement flip rule
for abs_metric in metric_columns:
abs_agreement_col = f"{abs_metric}_agreement"
if abs_metric in df_abs_gpt_agreement.columns and abs_agreement_col in df_abs_gpt_agreement.columns:
abs_gpt_vals = df_abs_gpt_agreement[abs_metric].astype(bool)
abs_agreement_vals = df_abs_gpt_agreement[abs_agreement_col]
abs_manual_vals = np.where(abs_agreement_vals == 1, abs_gpt_vals, ~abs_gpt_vals)
df_abs_gpt_agreement[f"manual_{abs_metric}"] = abs_manual_vals.astype(bool)
In [53]:
# Build manual abstract evaluations from df_abs_gpt_agreement manual_ columns
abs_manual_cols_map = {
f"manual_{m}": m for m in metric_columns if f"manual_{m}" in df_abs_gpt_agreement.columns
}
df_abstract_manual = df_abs_gpt_agreement[list(abs_manual_cols_map.keys())].rename(columns=abs_manual_cols_map)
# Ensure boolean dtype
df_abstract_manual = df_abstract_manual.astype(bool)
In [54]:
# Align indices and compute agreement for abstracts
common_idx_abs = df_abstract_manual.index.intersection(df_abstract_regex.index)
manual_bool_abs = df_abstract_manual.loc[common_idx_abs]
regex_bool_abs = df_abstract_regex.loc[common_idx_abs]
# abstract_metrics = [m for m in metric_columns if m in manual_bool_abs.columns and m in regex_bool_abs.columns]
results_abs = {}
for metric in metric_columns:
regex_vals = regex_bool_abs[metric].astype(bool)
manual_vals = manual_bool_abs[metric].astype(bool)
results_abs[metric] = {
'regex_sum': int(regex_vals.sum()),
'manual_sum': int(manual_vals.sum()),
'regex_proportion': float(regex_vals.mean()),
'regex_manual_agreement': float((regex_vals == manual_vals).mean()),
'manual_proportion': float(manual_vals.mean()),
'total_n': int(len(regex_vals)),
}
abstract_results_regex_df = pd.DataFrame(results_abs).T
tab_decimal_abs = abstract_results_regex_df
tab_percent_abs = abstract_results_regex_df.copy()
tab_percent_abs['regex_proportion'] = tab_percent_abs['regex_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_abs['manual_proportion'] = tab_percent_abs['manual_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_abs['regex_manual_agreement'] = tab_percent_abs['regex_manual_agreement'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_abs
Out[54]:
| | regex_sum | manual_sum | regex_proportion | regex_manual_agreement | manual_proportion | total_n |
|---|---|---|---|---|---|---|
| problem | 12.0 | 41.0 | 24% | 38% | 82% | 50.0 |
| objective | 4.0 | 44.0 | 8% | 20% | 88% | 50.0 |
| research_method | 20.0 | 22.0 | 40% | 56% | 44% | 50.0 |
| research_questions | 13.0 | 4.0 | 26% | 70% | 8% | 50.0 |
| pseudocode | 0.0 | 0.0 | 0% | 100% | 0% | 50.0 |
| dataset | 1.0 | 6.0 | 2% | 86% | 12% | 50.0 |
| hypothesis | 0.0 | 6.0 | 0% | 88% | 12% | 50.0 |
| prediction | 4.0 | 9.0 | 8% | 82% | 18% | 50.0 |
| code_available | 2.0 | 4.0 | 4% | 96% | 8% | 50.0 |
| software_dependencies | 0.0 | 1.0 | 0% | 98% | 2% | 50.0 |
| experiment_setup | 1.0 | 12.0 | 2% | 74% | 24% | 50.0 |
GPT-4 vs. Manual evaluations of manuscript abstracts¶
In [55]:
df_gpt_agreement_manu = pd.read_csv("https://huggingface.co/datasets/adbX/reproscreener_manual_evaluations/resolve/main/agreement_gpt.csv")
# Clean up columns - exclude metadata and description columns
# Make "paper_id" the index
df_gpt_agreement_manu = df_gpt_agreement_manu.set_index("paper_id")
df_gpt_agreement_manu = df_gpt_agreement_manu.drop(
columns=["evaluation_type", "source_file"]
+ [col for col in df_gpt_agreement_manu.columns if "_description" in col]
)
# Remove gpt_ prefix from column names
df_gpt_agreement_manu = df_gpt_agreement_manu.rename(
columns={
col: col.replace("gpt_", "")
for col in df_gpt_agreement_manu.columns
if col.startswith("gpt_")
}
)
results = {}
for manu_metric in metric_columns:
gpt_col = manu_metric
agreement_col = f"{manu_metric}_agreement"
if agreement_col in df_gpt_agreement_manu.columns:
gpt_vals = df_gpt_agreement_manu[gpt_col].astype(bool)
agreement_vals = df_gpt_agreement_manu[agreement_col]
# Calculate revised manual evaluation: keep GPT when agreement=1, invert when agreement=0
manual_vals_gpt_abs = np.where(agreement_vals == 1, gpt_vals, ~gpt_vals)
# Add manual_vals_gpt_abs to the agreement_gpt DataFrame
df_gpt_agreement_manu[f"manual_{manu_metric}"] = manual_vals_gpt_abs.astype(bool)
results[manu_metric] = {
'gpt_sum': gpt_vals.sum(),
'manual_sum': manual_vals_gpt_abs.sum(),
'gpt_proportion': gpt_vals.mean(),
'gpt_manual_agreement': agreement_vals.mean(),
'manual_proportion': manual_vals_gpt_abs.mean(),
'total_n': len(gpt_vals)
}
abstract_results_gpt4_df = pd.DataFrame(results).T
tab_decimal = abstract_results_gpt4_df
tab_percent = abstract_results_gpt4_df.copy()
tab_percent['gpt_proportion'] = tab_percent['gpt_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent['manual_proportion'] = tab_percent['manual_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent['gpt_manual_agreement'] = tab_percent['gpt_manual_agreement'].mul(100).round(0).astype(int).astype(str).add('%')
# tabs = mo.ui.tabs({"percent": tab_percent, "decimal": tab_decimal})
tab_percent
Out[55]:
| | gpt_sum | manual_sum | gpt_proportion | gpt_manual_agreement | manual_proportion | total_n |
|---|---|---|---|---|---|---|
| problem | 49.0 | 41.0 | 98% | 80% | 82% | 50.0 |
| objective | 49.0 | 44.0 | 98% | 86% | 88% | 50.0 |
| research_method | 47.0 | 22.0 | 94% | 46% | 44% | 50.0 |
| research_questions | 4.0 | 4.0 | 8% | 96% | 8% | 50.0 |
| pseudocode | 0.0 | 0.0 | 0% | 100% | 0% | 50.0 |
| dataset | 14.0 | 6.0 | 28% | 68% | 12% | 50.0 |
| hypothesis | 6.0 | 6.0 | 12% | 88% | 12% | 50.0 |
| prediction | 25.0 | 9.0 | 50% | 52% | 18% | 50.0 |
| code_available | 4.0 | 4.0 | 8% | 100% | 8% | 50.0 |
| software_dependencies | 1.0 | 1.0 | 2% | 100% | 2% | 50.0 |
| experiment_setup | 27.0 | 12.0 | 54% | 46% | 24% | 50.0 |
In [56]:
# Load LLaMA 3.2 abstract results
df_abstract_llama32 = pd.read_csv("../../llama3/outputs_json/20250829-235938/analysis_summary_reproscreener.csv")
df_abstract_llama32 = df_abstract_llama32.set_index("paper_id")
# Ensure boolean dtype for metrics present in this dataframe
available_cols_llama32 = [c for c in metric_columns if c in df_abstract_llama32.columns]
df_abstract_llama32[available_cols_llama32] = df_abstract_llama32[available_cols_llama32].astype(bool)
# df_abstract_llama32.apply(lambda x: x.value_counts())
common_idx_abs_llama32 = df_abstract_manual.index.intersection(df_abstract_llama32.index)
manual_bool_abs_llama32 = df_abstract_manual.loc[common_idx_abs_llama32]
llama32_bool_abs = df_abstract_llama32.loc[common_idx_abs_llama32]
metrics_llama32_shared = [
m for m in metric_columns
if m in manual_bool_abs_llama32.columns and m in llama32_bool_abs.columns
]
results_abs_llama32 = {}
for metric_llama32 in metrics_llama32_shared:
llama32_vals = llama32_bool_abs[metric_llama32].astype(bool)
manual_vals_llama32 = manual_bool_abs_llama32[metric_llama32].astype(bool)
results_abs_llama32[metric_llama32] = {
'llama32_sum': int(llama32_vals.sum()),
'manual_sum_llama32': int(manual_vals_llama32.sum()),
'llama32_proportion': float(llama32_vals.mean()),
'llama32_manual_agreement': float((llama32_vals == manual_vals_llama32).mean()),
'manual_proportion_llama32': float(manual_vals_llama32.mean()),
'total_n_llama32': int(len(llama32_vals)),
}
abstract_results_llama32_df = pd.DataFrame(results_abs_llama32).T
tab_decimal_abs_llama32 = abstract_results_llama32_df
tab_percent_abs_llama32 = abstract_results_llama32_df.copy()
tab_percent_abs_llama32['llama32_proportion'] = tab_percent_abs_llama32['llama32_proportion'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_abs_llama32['manual_proportion_llama32'] = tab_percent_abs_llama32['manual_proportion_llama32'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_abs_llama32['llama32_manual_agreement'] = tab_percent_abs_llama32['llama32_manual_agreement'].mul(100).round(0).astype(int).astype(str).add('%')
tab_percent_abs_llama32
Out[56]:
| | llama32_sum | manual_sum_llama32 | llama32_proportion | llama32_manual_agreement | manual_proportion_llama32 | total_n_llama32 |
|---|---|---|---|---|---|---|
| problem | 41.0 | 41.0 | 82% | 72% | 82% | 50.0 |
| objective | 14.0 | 44.0 | 28% | 36% | 88% | 50.0 |
| research_method | 7.0 | 22.0 | 14% | 54% | 44% | 50.0 |
| research_questions | 7.0 | 4.0 | 14% | 86% | 8% | 50.0 |
| pseudocode | 0.0 | 0.0 | 0% | 100% | 0% | 50.0 |
| dataset | 1.0 | 6.0 | 2% | 90% | 12% | 50.0 |
| hypothesis | 2.0 | 6.0 | 4% | 84% | 12% | 50.0 |
| prediction | 5.0 | 9.0 | 10% | 80% | 18% | 50.0 |
| code_available | 3.0 | 4.0 | 6% | 98% | 8% | 50.0 |
| software_dependencies | 0.0 | 1.0 | 0% | 98% | 2% | 50.0 |
| experiment_setup | 0.0 | 12.0 | 0% | 76% | 24% | 50.0 |
In [57]:
def plot_heatmap_llama(metrics_display_map):
# Metrics on rows, papers on columns
heatmap_df = df_abstract_llama32.astype(float).T
heatmap_df.index = [metrics_display_map.get(m, m.replace("_", " ").title()) for m in heatmap_df.index]
# Two-color scheme (empty, filled)
# custom_cmap = ListedColormap(["#FFF0F0", "#E74C3C"])
custom_cmap = ListedColormap(["#DFF3E3", "#3D9963"])
fig, ax = plt.subplots(figsize=(12, 4), tight_layout={"pad": 1.5})
# Black frame
ax.axhline(y=0, color="k", linewidth=1.5)
ax.axvline(x=0, color="k", linewidth=1.5)
ax.axhline(y=heatmap_df.shape[0], color="k", linewidth=1.5)
ax.axvline(x=heatmap_df.shape[1], color="k", linewidth=1.5)
sns.heatmap(heatmap_df, cmap=custom_cmap, cbar=False, linewidths=1.5, ax=ax)
# ax.set(xlabel="Paper", ylabel="Metric")
# plt.title("LLama 3.2 evaluations on manuscript abstracts")
ax.set(xlabel="")
plt.subplots_adjust(top=0.95, left=0.15, right=0.95)
plt.tight_layout()
plt.tick_params(axis='both', size=0, pad=5)
for tick in ax.get_xticklabels():
tick.set_fontsize(12)
for tick in ax.get_yticklabels():
tick.set_fontsize(14)
plt.savefig(current_figures_dir + "hm_abstract_llama32.png", dpi=1280, bbox_inches="tight")
plt.savefig(thesis_figures_dir + "hm_abstract_llama32.png", dpi=1280, bbox_inches="tight")
plt.show()
plot_heatmap_llama(metrics_display_map)
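The heatmap above uses a two-entry `ListedColormap`, so False cells take the light shade and True cells the dark shade. A standalone sketch of the same idea on a tiny made-up matrix (the colours match the figure; the data do not):

```python
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Illustrative only: a 2x3 boolean matrix rendered with a two-colour map,
# where 0/False maps to the first colour and 1/True to the second
demo = np.array([[0, 1, 1], [1, 0, 0]], dtype=float)
cmap = ListedColormap(["#DFF3E3", "#3D9963"])
sns.heatmap(demo, cmap=cmap, cbar=False, linewidths=1.5)
plt.show()
```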
Abstract evaluation comparison¶
In [58]:
# Merge the agreement-with-manual columns from the regex, GPT-4, and LLaMA 3.2 evaluations
merged_agreement_results = pd.DataFrame({
'Regex': abstract_results_regex_df['regex_manual_agreement'],
'GPT-4': abstract_results_gpt4_df['gpt_manual_agreement'],
'LLaMA 3.2': abstract_results_llama32_df['llama32_manual_agreement']
})
merged_agreement_results.index.name = 'Metric'
merged_agreement_results_melt = merged_agreement_results.reset_index().melt(id_vars='Metric', var_name='Method', value_name='Agreement')
merged_agreement_results
Out[58]:
| Metric | Regex | GPT-4 | LLaMA 3.2 |
|---|---|---|---|
| problem | 0.38 | 0.80 | 0.72 |
| objective | 0.20 | 0.86 | 0.36 |
| research_method | 0.56 | 0.46 | 0.54 |
| research_questions | 0.70 | 0.96 | 0.86 |
| pseudocode | 1.00 | 1.00 | 1.00 |
| dataset | 0.86 | 0.68 | 0.90 |
| hypothesis | 0.88 | 0.88 | 0.84 |
| prediction | 0.82 | 0.52 | 0.80 |
| code_available | 0.96 | 1.00 | 0.98 |
| software_dependencies | 0.98 | 1.00 | 0.98 |
| experiment_setup | 0.74 | 0.46 | 0.76 |
In [59]:
# tab_percent.manual_proportion
merged_agreement_results_manual = pd.DataFrame({
'Regex': tab_percent_abs['regex_manual_agreement'],
'GPT-4': tab_percent['gpt_manual_agreement'],
'LLaMA 3.2': tab_percent_abs_llama32['llama32_manual_agreement'],
'Manual Proportion (n=50)': tab_percent.manual_proportion
})
merged_agreement_results_manual
Out[59]:
| | Regex | GPT-4 | LLaMA 3.2 | Manual Proportion (n=50) |
|---|---|---|---|---|
| problem | 38% | 80% | 72% | 82% |
| objective | 20% | 86% | 36% | 88% |
| research_method | 56% | 46% | 54% | 44% |
| research_questions | 70% | 96% | 86% | 8% |
| pseudocode | 100% | 100% | 100% | 0% |
| dataset | 86% | 68% | 90% | 12% |
| hypothesis | 88% | 88% | 84% | 12% |
| prediction | 82% | 52% | 80% | 18% |
| code_available | 96% | 100% | 98% | 8% |
| software_dependencies | 98% | 100% | 98% | 2% |
| experiment_setup | 74% | 46% | 76% | 24% |
In [60]:
def plot_agreement_by_metric_mpl(metrics_display_map, metric_order):
df_sorted = merged_agreement_results_melt.sort_values(["Metric","Method"])
methods = df_sorted["Method"].unique()
colors = {
"Regex": "#F84F52",
"GPT-4": "#40A6DD",
"LLaMA 3.2": "#57C785"
# "Reproscreener (regex)": "#FBA4A6",
# "GPT-4": "#B0DAF1",
# "LLaMA 3.2": "#8FDAAE"
}
groups = [
["problem", "objective", "research_method", "research_questions"],
["dataset", "hypothesis", "prediction", "experiment_setup"],
["pseudocode", "code_available", "software_dependencies"],
]
width = 0.8 / len(methods)
for idx, group in enumerate(groups, start=1):
x = np.arange(len(group))
fig, ax = plt.subplots(figsize=(6, 3))
for i, m in enumerate(methods):
sub = df_sorted[df_sorted["Method"] == m].set_index("Metric").reindex(group)
ax.bar(x + i*width - (len(methods)-1)*width/2, sub["Agreement"].values, width, label=m, color=colors[m])
ax.set_xticks(x)
ax.set_xticklabels([ "\n".join(wrap(metrics_display_map[m], 12)) for m in group ])
ax.set_ylim(0, 1.05)
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0', '20', '40', '60', '80', '100'])
ax.set_ylabel('Agreement (%)')
ax.yaxis.grid(True, linestyle="--", linewidth=0.2, color="#1E1E1E", alpha=1)
ax.tick_params(axis="x", size=0, pad=5)
ax.spines[["top", "right", "bottom", "left"]].set_visible(False)
ax.tick_params(axis="y", size=0, pad=5)
fig.text(x=0.02, y=1, s=f"Agreement with human evaluation on manuscript abstracts ({idx}/{len(groups)})", size=14, fontweight="bold")
fig_text(
x=0.02,
y=0.97,
s="<Regex> <GPT-4> <LLaMA 3.2>",
highlight_textprops=[
{"color": colors["Regex"], "fontweight": "bold"},
{"color": colors["GPT-4"], "fontweight": "bold"},
{"color": colors["LLaMA 3.2"], "fontweight": "bold"},
],
)
plt.tight_layout()
plt.savefig(current_figures_dir + f"agreements_on_abstracts_row{idx}.png", dpi=320, bbox_inches="tight")
plt.savefig(thesis_figures_dir + f"agreements_on_abstracts_row{idx}.png", dpi=320, bbox_inches="tight")
plt.show()
plot_agreement_by_metric_mpl(metrics_display_map, metric_order)
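In the grouped bar plot, each method's bars are shifted from the shared tick by `i*width - (len(methods)-1)*width/2`, which centres the group of bars on the tick. A quick illustrative check of those offsets for three methods:

```python
# Illustrative only: offsets used to centre a group of three bars on one x tick
n_methods = 3
width = 0.8 / n_methods
offsets = [i * width - (n_methods - 1) * width / 2 for i in range(n_methods)]
print([round(o, 3) for o in offsets])  # [-0.267, 0.0, 0.267]
```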