python - 比较文件时出错: IndexError: list index out of range

标签 python offset

我有一个程序,可以逐行比较文件并通过读取两个文件夹(“黄金文件夹”和“预测文件夹”)来计算精度。

解压出来的文件是这样的:

T1  Task 5 19   nonlinear wave
T2  Task 5 29   nonlinear wave equations
T3  Task 15 29  wave equations
T4  Task 86 111 general analytical method
T5  Task 94 111 analytical method
T6  Task 199 213    minimum stages
T7  Task 268 287    efficient technique
T8  Task 268 298    efficient technique relatingto

还有黄金文件:

T1  Process 5 14    oxidation
T2  Material 69 84  Ti-based alloys
T3  Material 186 192    alloys
T4  Task 264 349    understand the role that composition has on the oxidation behavior of Ti-based alloys
T5  Process 312 321 oxidation
T6  Material 334 349    Ti-based alloys
T7  Material 400 415    Ti-based alloys
T8  Material 445 451    alloys
T9  Process 480 489 oxidation

问题是此代码生成此错误:

Traceback (most recent call last):
  File "C:\Users\chedi\Downloads\Semeval\eval.py", line 214, in <module>
    calculateMeasures(folder_gold, folder_pred, remove_anno)
  File "C:\Users\chedi\Downloads\Semeval\eval.py", line 31, in calculateMeasures
    res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
  File "C:\Users\chedi\Downloads\Semeval\eval.py", line 130, in normaliseAnnotations
    r_g_offs = r_g[1].split(" ")
IndexError: list index out of range

错误出现在第 130 行和提取文件的格式中,但它们的格式似乎相同:第一列和第二列用制表符分隔,偏移量用空格

    #!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not micro averaging correctly and changed it

import os
from sklearn.metrics import precision_recall_fscore_support
import sys

def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno = ""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set if "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the later, false positive
    :return:
    '''

    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        spans_all = set(spans_gold + spans_pred)

        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                # those are the false positives, contained in pred but not gold
                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                # those are the false negatives, contained in gold but not pred
                res_all_pred.append("NONE")


    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(
        res_all_gold, res_all_pred, labels=targets, average=None)
    # unpack the precision, recall, f1 and support
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(
            res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics


def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']

    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')

    print(''.join(report))


def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []

    for l in file_anno:
        r_g = l.strip().split("\t")
        r_g_offs = r_g[1].split(" ")

        # remove relation instances if specified
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue

        res_full_anno.append(l.strip())
        # normalise relation instances by looking up entity spans for relation IDs
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")

            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))

        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)



    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:  # 1, 2
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])

        # Check, in all other hyponym relations, if the synonymous entity with smallest index is used for them.
        # If not, change it so it is.
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []

    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])

    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno


if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]

    calculateMeasures(folder_gold, folder_pred, remove_anno)

最佳答案

我自己没有文件,我尝试使用您提供的“黄金”文件,即:

T1      Process 5 14    oxidation
T2      Material 69 84  Ti-based alloys
T3      Material 186 192    alloys
T4      Task 264 349    understand the role that composition has on the oxidation behavior of Ti-based alloys
T5      Process 312 321 oxidation
T6      Material 334 349    Ti-based alloys
T7      Material 400 415    Ti-based alloys
T8      Material 445 451    alloys
T9      Process 480 489 oxidation

为了使程序能够正确运行并且不会在您提到的代码行中出现“列表索引超出范围”的错误,第一列(“Ts”)和第二列有一个制表符,其他列之间有一个空格。如果无法以这种方式设置正确的文件格式(例如,在前两列之间使用空格而不是制表符)将会出现该错误。事实上,行中到底发生了什么

r_g = l.strip('\n').split("\t")  

是首先删除行尾的换行符,然后按制表符分割该行。这意味着该行被分成两个元素,它们组成了列表r_g。在这种情况下,r_g_offs 可以正确计算,并且将包含一个元素列表,这些元素是除第一列之外的所有列。在某些情况下,这将在稍后使用,例如
spans_anno.append("".join([r_g_offs[1], r_g_offs[2]]))
仅举一例。

让我们看一下不起作用的情况,并尝试理解原因。 如果文件 .ann (gold) 不是这样格式化的:

T1\tProcess (tab between)

而是

T1 Process (space)

代码

r_g = l.strip('\n').split("\t")  

将生成一个仅包含一个元素而不是两个元素的列表,例如

r_g = ['T1 Process ...']

在本例中,r_g 只有一个元素,即元素 r_g[0],因此当尝试访问不存在的元素时 ( r_g[1]) 通过

r_g_offs = r_g[1].split()  

一个人会得到一个

IndexError: list index out of range

<小时/>

还有另一种情况可能会出现上述错误。
在文件末尾为空行的情况下,r_g = [''],这意味着r_g是一个只有一个元素的列表。现在,与之前的情况类似,当脚本执行 r_g_offs = r_g[1].split() 行时,将尝试访问 r_g[1],这不会不存在,因为在这种情况下列表中的唯一元素是 r_g[0] 并且您将收到“列表索引超出范围”错误。

<小时/>

我可以运行的代码:

#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not
# micro averaging correctly and changed it

import os
from sklearn.metrics import precision_recall_fscore_support
import sys

def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set if "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the later, false positive
    :return:
    '''

    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in flist_gold:
        # ignoring non-.ann files, should there
        # be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        spans_all = set(spans_gold + spans_pred)

        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:

                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:

                res_all_pred.append("NONE")

        #y_true, y_pred, labels, targets
        prec, recall, f1, support = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average=None)
        metrics = {}
        for k, target in enumerate(targets):
            metrics[target] = {
                'precision': prec[k],
                'recall': recall[k],
                'f1-score': f1[k],
                'support': support[k]
            }

        # now
        # micro-averaged
        if remove_anno != 'types':
            prec, recall, f1, s = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average='micro')
            metrics['overall'] = {
                'precision': prec,
                'recall': recall,
                'f1-score': f1,
                'support': sum(support)
            }
        else:
            # just
            # binary
            # classification,
            # nothing
            # to
            # average
            metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics

def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']

    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(
    metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')

    print(''.join(report))

def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []

    for l in file_anno:
        print(l)
        print(l.strip('\n'))
        r_g = l.strip('\n').split("\t")
        print(r_g)
        print(len(r_g))
        r_g_offs = r_g[1].split()
        print(r_g_offs)
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue

        res_full_anno.append(l.strip())

        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")

            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))

        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)

    for r in rels_anno:
        r_offs = r.split(" ")
# reorder hyponyms to start with smallest index
# 1, 2
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []

    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])

    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno

if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]

From the two cases shown above, we can conclude that the script is very sensible to how the file are formatted/written (tab, spaces and no empty line at the end), so care will be needed when producing those files and feeding them to the main script.

关于python - 比较文件时出错: IndexError: list index out of range,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41521891/

相关文章:

python - 复制.pdf、.html、.jpeg文件的查询方法

python - 反转numpy数组的最有效方法

ms-office - 如何在excel中写入偏移量的引用?

c++ - 在编译时确定结构成员字节偏移量?

Delphi:记录字段的偏移量

c++ - 从 ".exe"+ 偏移量读取内存?

python - GTK - Python Treeview 排序列数据(文件大小 - 字节数据)

python - (Python 3) 如何组合列表的产品?

python - 使用 Python 优化查找最近创建的前 10 个文件

PHP: undefined offset 为 0,但它就在那里(我认为)