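"""Evaluate language predictions against their ground truths with three metrics:

exact-match accuracy, a GPT-based score (via utils.gpt_eval.gpt_forward), and
the COCO captioning metrics (BLEU, ROUGE_L, CIDEr, METEOR, SPICE).

Expects a JSON file containing (prediction, ground truth) string pairs.
"""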
import argparse
import json
import sys
from multiprocessing import Pool

import language_evaluation
from tqdm import tqdm

sys.path.append(".")
from utils.gpt_eval import gpt_forward


class evaluation_suit:
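    """Collects (answer, GT) pairs via forward() and scores them with
    accuracy, GPT, and COCO language metrics."""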
    def __init__(self):
        self.language_eval = language_evaluation.CocoEvaluator(
            coco_types=["BLEU", "ROUGE_L", "CIDEr", "METEOR", "SPICE"])
        self.GPT = []
        self.accuracy = {"answer": [], "GT": []}
        self.language = {"answer": [], "GT": []}

    def eval_acc(self):
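        """Return the exact-match accuracy over all collected pairs."""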
        scores = []
        for i in tqdm(range(len(self.accuracy["answer"]))):
            answer = self.accuracy["answer"][i]
            GT = self.accuracy["GT"][i]
            scores.append(1.0 if answer == GT else 0.0)
        return sum(scores) / len(scores)

    def eval_chatGPT(self, data):
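        """Return the mean GPT score over `data`; samples for which
        gpt_forward returns -1 are treated as invalid and dropped."""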
        with Pool(16) as p:  # Adjust the worker count to your CPU cores
            scores_all = p.map(gpt_forward, data)
        scores = [x for x in scores_all if x != -1]
        deleted = len(scores_all) - len(scores)
        print(f"Deleted {deleted} invalid samples")
        scores = list(map(float, scores))
        return sum(scores) / len(scores)

    def eval_language(self):
        """
        Return the COCO captioning metrics as a dict keyed by "val/<metric>".

        Long inputs are evaluated in chunks of 500 samples and the per-chunk
        results are combined as a mean weighted by chunk size.
        """
        answer = self.language["answer"]
        GT = self.language["GT"]
        chunk_size = 500
        n_total = len(answer)
        # Fast path: single chunk
        if n_total <= chunk_size:
            results = self.language_eval.run_evaluation(answer, GT)
            return {f"val/{k}": float(v) for k, v in results.items()}
        # Split and combine afterwards (weighted by chunk size)
        results_accumulator = {}
        total_items = 0
        for i in range(0, n_total, chunk_size):
            answer_split = answer[i:i + chunk_size]
            GT_split = GT[i:i + chunk_size]
            chunk_len = len(answer_split)
            total_items += chunk_len
            results_gen = self.language_eval.run_evaluation(answer_split, GT_split)
            # Accumulate weighted sums
            for k, v in results_gen.items():
                results_accumulator[k] = results_accumulator.get(k, 0.0) + float(v) * chunk_len
        # Weighted mean over all items
        return {f"val/{k}": v_sum / total_items for k, v_sum in results_accumulator.items()}

    def forward(self, answer, GT):
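        """Strip the 'A: ' prefix and the chat end token from both strings,
        then register the pair with every evaluator."""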
        answer = answer.replace('A: ', '').replace(' <|im_end|>', '')
        GT = GT.replace('A: ', '').replace(' <|im_end|>', '')
        self.accuracy["answer"].append(answer)
        self.accuracy["GT"].append(GT)
        self.GPT.append((answer, GT))
        self.language["GT"].append(GT)
        self.language["answer"].append(answer)

    def evaluation(self):
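        """Run all three evaluations; a failing stage is logged and replaced
        by a default score instead of aborting the others."""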
        print("evaluation start!")
        scores = {}
        try:
            print("accuracy evaluation")
            scores["accuracy"] = self.eval_acc()
        except Exception as e:
            print(f"Error in accuracy evaluation: {e}")
            scores["accuracy"] = 0.0
        try:
            print("chatGPT evaluation")
            scores["chatgpt"] = self.eval_chatGPT(self.GPT)
        except Exception as e:
            print(f"Error in chatGPT evaluation: {e}")
            scores["chatgpt"] = 0.0
        try:
            print("language evaluation")
            scores["language"] = self.eval_language()
        except Exception as e:
            print(f"Error in language evaluation: {e}")
            scores["language"] = {}
        return scores


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluation')
    parser.add_argument('--root_path1', type=str,
                        default="outputs/simlingo/predictions/language_preds_cot_rank_0.json",
                        help='path to prediction file')
    args = parser.parse_args()

    # The prediction file holds (prediction, ground truth) string pairs.
    with open(args.root_path1, 'r') as f:
        pred_file = json.load(f)
    gt = [preds[1].replace('A: ', '') for preds in pred_file]
    pred = [preds[0].replace('A: ', '') for preds in pred_file]
    print("Number of predictions: ", len(pred))
    print("Number of ground truths: ", len(gt))

    evaluation = evaluation_suit()
    for i in range(len(pred)):
        evaluation.forward(pred[i], gt[i])
    output = evaluation.evaluation()
    print("accuracy score: ", output["accuracy"])
    print("chatgpt score: ", output["chatgpt"])
    print("language score: ", output["language"])

    # Save the evaluation results next to the prediction file
    save_path = args.root_path1.replace(".json", "_metrics_gpt-4o-2024-08-06.json")
    with open(save_path, 'w') as f:
        json.dump(output, f, indent=4)
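# Example invocation (using the default prediction path above):
#   python eval_metrics.py --root_path1 outputs/simlingo/predictions/language_preds_cot_rank_0.json
# writes language_preds_cot_rank_0_metrics_gpt-4o-2024-08-06.json next to the input.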