python - 使用 Spacy 训练新模型

标签 python spacy

我正在使用 spaCy 训练一个命名实体识别模型。官方文档中关于更新命名实体识别器(Updating the Named Entity Recognizer)的部分给出了下面这段用于更新现有模型的代码;我只指定了用作基础的模型、保存位置和迭代次数。

from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path

# training data
# Each item is a (text, annotations) pair. annotations["entities"] is a list
# of (start, end, label) tuples; main() registers the label (third element)
# with the NER component. The first two elements are presumably character
# offsets into the text (spaCy's training format) -- confirm.
TRAIN_DATA = [
    ("', 137 S Wilmington Street Raleigh, NC 27601 919.239.4070\t, • Server: Brian 20/1 Guests: 8\t10/01/2018 1:11 PM 20014, L Chicken Arti Pizza\t10.99, Subtotal Tax\t10.99 0.91, Total\t11.90, Balance Due\t11.90, Gratuity Suggestions To Help:, 20% = 2.20 18% = 1.2L,, 115% = 1.65 |f ,9., '?", {"entities": [(3, 19, "ORG")]}),
    # NOTE(review): text[8:18] is " Ale House" (leading space included);
    # the full name "Carolina Ale House" would be (0, 18) -- confirm the
    # intended span.
    ("Carolina Ale House, G1enwood, 0516 Table 23 #Party 1 JORDYN M SvrCk: 27 7:42p 09/30/18, Separate checks: 3-of-7\t, 2 Carolina Hurrlca\t15.50, 1 Smoked Cheddar Burger\t9.79, Sub Total:\t25.29, Tax:\t2.08, Sub Total:\t27.37, 20X GRATUIT\t5.06, 09/30 10:36pTO TAI : 32\t, D i d you enjoy Every delicious Bite’? Come back to See us and bring your friends*, You are always Welcome at our, House>", {"entities": [(8, 18, "ORG")]}),
    (", P~ l-LMl NG *, PRIME STEAKHOUSE 8, WINE BAR, Kalelyh, nr 27612 919-571-6200, Sgj*1® IABIE 51\t6, UlER1 H SvrCk: 5 8:04p 10/02/18, 1\tBlueheny Lemon Drop, ^ Corona, 2\tCraft Beer 2 2 120 Tomahawk 1 Pork Chop 1 Scottish Salmon 4 Prime Dessert, 13.00, 35.00 14.50, 240.00, 40.00, 44.00 0.00, Sub Total: 386.50 „\tTax:\t31.89, 10/02 9:59pTOTAL :\t418.39, www.F1emingsSteakhouse.com ) rials'., Dine Rewards account not attached, Not a Dine Rewards member?, Join now at DINE-REWARDS.COM, ", {"entities": [(17, 35, "ORG")]}),
    ("Flying Saucer Draught Emporium, 328 Morgan Raleigh, NC, Server: Hope 10/30/7 Guests: 0, 10/04/2018 8:26 PM 20068, L10- Cocktail, 8.00, L10- Classic Daiquiri 1/2 Nacho Libre-r L10- Liqueur, L10- Baily’s Irish Cream L10- Rocks, Subtotal, Tax, Total, 5.50, 8.00, 21.50, 0.45, 21.95, Balance Due\t21., T»p: 3,zT., If you pay with debit card, your bank may hold additional funds temporarily. This is not a charge from Flying Saucer, www. beerknurd .com Taxi Taxi - 919.333.3333", {"entities": [(0, 30, "ORG")]}),
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model='en_core_web_sm', output_dir=None, n_iter=100):
    """Train the entity recognizer on TRAIN_DATA and optionally save it.

    Args:
        model: Name or path passed to ``spacy.load``. When ``None``, a blank
            'en' pipeline is created instead and its weights are initialised
            before training.
        output_dir: Directory the trained pipeline is written to. When
            ``None`` nothing is saved and the reload check is skipped.
        n_iter: Number of passes over the training data.
    """
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        # Without this branch spacy.load(None) would fail and the
        # "model is None" initialisation further down could never run.
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    # Reuse the pipeline's NER component when present, otherwise append one.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the training annotations.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities", ()):
            ner.add_label(ent[2])

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its order.
    train_data = list(TRAIN_DATA)

    # Disable every other component so that only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # Weights are (re)initialised only for a freshly created pipeline.
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # Check the in-memory pipeline on the training texts.
    _print_entities(nlp)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: nested targets work and there is no
        # check-then-create race.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Reload from disk to confirm the saved pipeline behaves the same.
        print("Loading from", output_dir)
        _print_entities(spacy.load(output_dir))


def _print_entities(nlp):
    """Run ``nlp`` over each TRAIN_DATA text and print entities and tokens."""
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

if __name__ == "__main__":
    # Call main() directly. Writing plac.call(main(...)) runs main() first
    # and hands its return value (None) to plac.call, which then raises
    # "TypeError: Could not determine the signature of None".
    main('en_core_web_sm', Path.cwd(), 100)

执行代码后出现了下面的错误,我找不到任何相关资料。尽管新模型已经生成,但我测试时发现它只能识别训练数据(TRAIN_DATA)中的实体,而它本应同时识别 spaCy 基础模型“en_core_web_sm”原有的实体。

Traceback (most recent call last):
  File "train.py", line 105, in <module>
    plac.call(main('en_core_web_sm', Path.cwd(), 100))
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 324, in call
    parser = parser_from(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 133, in parser_from
    parser.populate_from(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 248, in populate_from
    self._set_func_argspec(func)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 240, in _set_func_argspec
    self.argspec = getargspec(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 38, in getargspec
    str(callableobj))
TypeError: Could not determine the signature of None

最佳答案

我通过去掉 plac 解决了这个问题。

if __name__ == "__main__":
    # Invoke main() directly instead of wrapping its return value in a
    # call() helper.
    main('en_core_web_sm', Path.cwd(), 100)

关于python - 使用 Spacy 训练新模型,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57203633/

相关文章:

python - 如何删除pyspeech windows语音识别命令?

python - Windows Python 多播上的错误 10049

python - 在 Python 中扩展一个数字 block

python - 从非结构化医疗文档中提取文本以进行 NLP

python - spacy安装错误(运行cythonize失败)

python - 使用 spacy 从文档中删除命名实体

python - 如何在 django admin 中使用自定义字段进行搜索

Python函数出现异常时返回1

python - textcat -> 不允许架构额外字段

python - 我想从 spacy 中的文本中提取文本值