In [1]:
from deeppavlov import configs, build_model
In [2]:
import fasttext as fastText
In [208]:
import pickle 
import re
import pymorphy2
import csv
In [3]:
ner_model = build_model(configs.ner.ner_rus, download=True)
2019-09-15 11:43:52.429 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 205: Starting new HTTP connection (1): files.deeppavlov.ai:80
2019-09-15 11:43:52.523 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 393: http://files.deeppavlov.ai:80 "GET /embeddings/lenta_lower_100.bin.md5 HTTP/1.1" 200 54
2019-09-15 11:43:54.550 INFO in 'deeppavlov.download'['download'] at line 115: Skipped http://files.deeppavlov.ai/embeddings/lenta_lower_100.bin download because of matching hashes
2019-09-15 11:43:54.552 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 205: Starting new HTTP connection (1): files.deeppavlov.ai:80
2019-09-15 11:43:54.667 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 393: http://files.deeppavlov.ai:80 "GET /deeppavlov_data/ner_rus_v2_cpu_compatible.tar.gz.md5 HTTP/1.1" 200 383
2019-09-15 11:43:54.695 INFO in 'deeppavlov.download'['download'] at line 115: Skipped http://files.deeppavlov.ai/deeppavlov_data/ner_rus_v2_cpu_compatible.tar.gz download because of matching hashes
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
2019-09-15 11:43:56.274 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 100: [loading vocabulary from /root/.deeppavlov/models/ner_rus/word.dict]
2019-09-15 11:43:56.320 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 100: [loading vocabulary from /root/.deeppavlov/models/ner_rus/tag.dict]
2019-09-15 11:43:56.324 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 100: [loading vocabulary from /root/.deeppavlov/models/ner_rus/char.dict]
2019-09-15 11:43:56.331 INFO in 'deeppavlov.models.embedders.fasttext_embedder'['fasttext_embedder'] at line 52: [loading fastText embeddings from `/root/.deeppavlov/downloads/embeddings/lenta_lower_100.bin`]

2019-09-15 11:44:01.754 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/core/common/check_gpu.py:25: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

2019-09-15 11:44:01.755 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/core/common/check_gpu.py:28: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

Using TensorFlow backend.
2019-09-15 11:44:01.911 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/models/ner/network.py:105: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.

2019-09-15 11:44:01.912 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/models/ner/network.py:172: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

2019-09-15 11:44:01.920 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/models/ner/network.py:112: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

2019-09-15 11:44:01.930 WARNING in 'tensorflow'['deprecation'] at line 323: From /base/DeepPavlov/deeppavlov/core/layers/tf_layers.py:416: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
2019-09-15 11:44:02.27 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18403ece48>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18403ece48>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:02.123 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:02.225 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18403ece48>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18403ece48>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:02.326 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:02.346 WARNING in 'tensorflow'['deprecation'] at line 506: From /base/DeepPavlov/deeppavlov/core/layers/tf_layers.py:949: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
2019-09-15 11:44:02.359 WARNING in 'tensorflow'['deprecation'] at line 323: From /base/DeepPavlov/deeppavlov/models/ner/network.py:214: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
2019-09-15 11:44:02.361 INFO in 'deeppavlov.core.layers.tf_layers'['tf_layers'] at line 757: 
Warning! tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell is used. It is okay for inference mode, but if you train your model with this cell it could NOT be used with tf.contrib.cudnn_rnn.CudnnLSTMCell later. 
2019-09-15 11:44:02.378 WARNING in 'tensorflow'['deprecation'] at line 323: From /base/DeepPavlov/deeppavlov/core/layers/tf_layers.py:729: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
2019-09-15 11:44:02.382 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/core/layers/tf_layers.py:733: The name tf.nn.rnn_cell.LSTMStateTuple is deprecated. Please use tf.compat.v1.nn.rnn_cell.LSTMStateTuple instead.

2019-09-15 11:44:02.383 WARNING in 'tensorflow'['deprecation'] at line 323: From /base/DeepPavlov/deeppavlov/core/layers/tf_layers.py:736: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
2019-09-15 11:44:02.461 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f1840194588>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f1840194588>>: AttributeError: module 'gast' has no attribute 'Num'
2019-09-15 11:44:02.463 WARNING in 'tensorflow'['deprecation'] at line 506: From /usr/local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
WARNING: Entity <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Conv.call of <tensorflow.python.layers.convolutional.Conv2D object at 0x7f18d94621d0>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f1840194588>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f1840194588>>: AttributeError: module 'gast' has no attribute 'Num'
2019-09-15 11:44:02.556 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230438>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230438>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:02.582 WARNING in 'tensorflow'['deprecation'] at line 506: From /base/DeepPavlov/deeppavlov/core/layers/tf_layers.py:862: calling reverse_sequence (from tensorflow.python.ops.array_ops) with seq_dim is deprecated and will be removed in a future version.
Instructions for updating:
seq_dim is deprecated, use seq_axis instead
2019-09-15 11:44:02.583 WARNING in 'tensorflow'['deprecation'] at line 506: From /usr/local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling reverse_sequence (from tensorflow.python.ops.array_ops) with batch_dim is deprecated and will be removed in a future version.
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
2019-09-15 11:44:02.586 INFO in 'deeppavlov.core.layers.tf_layers'['tf_layers'] at line 757: 
Warning! tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell is used. It is okay for inference mode, but if you train your model with this cell it could NOT be used with tf.contrib.cudnn_rnn.CudnnLSTMCell later. 
2019-09-15 11:44:02.658 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f18400a6a90>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f18400a6a90>>: AttributeError: module 'gast' has no attribute 'Num'
WARNING: Entity <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230438>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230438>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f18400a6a90>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method MultiRNNCell.call of <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7f18400a6a90>>: AttributeError: module 'gast' has no attribute 'Num'
2019-09-15 11:44:02.770 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230978>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230978>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:02.832 WARNING in 'tensorflow'['deprecation'] at line 323: From /base/DeepPavlov/deeppavlov/models/ner/network.py:251: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
2019-09-15 11:44:02.963 WARNING in 'tensorflow'['ag_logging'] at line 145: Entity <bound method Dense.call of <tensorflow.python.layers.core.Dense object at 0x7f18c00c2cc0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Dense.call of <tensorflow.python.layers.core.Dense object at 0x7f18c00c2cc0>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230978>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method LSTMBlockCell.call of <tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops.CudnnCompatibleLSTMCell object at 0x7f1840230978>>: AssertionError: Bad argument number for Name: 3, expecting 4
WARNING: Entity <bound method Dense.call of <tensorflow.python.layers.core.Dense object at 0x7f18c00c2cc0>> could not be transformed and will be executed as-is. Please report this to the AutgoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Dense.call of <tensorflow.python.layers.core.Dense object at 0x7f18c00c2cc0>>: AssertionError: Bad argument number for Name: 3, expecting 4
2019-09-15 11:44:03.21 WARNING in 'tensorflow'['deprecation'] at line 323: From /usr/local/lib/python3.6/site-packages/tensorflow/contrib/crf/python/ops/crf.py:99: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
2019-09-15 11:44:03.195 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/models/ner/network.py:273: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.

2019-09-15 11:44:04.617 WARNING in 'tensorflow'['deprecation'] at line 323: From /base/DeepPavlov/deeppavlov/core/models/tf_model.py:42: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
2019-09-15 11:44:04.619 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 43: [loading model from /root/.deeppavlov/models/ner_rus/model]
2019-09-15 11:44:04.620 WARNING in 'tensorflow'['deprecation_wrapper'] at line 119: From /base/DeepPavlov/deeppavlov/core/models/tf_model.py:46: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.

2019-09-15 11:44:04.642 INFO in 'tensorflow'['saver'] at line 1280: Restoring parameters from /root/.deeppavlov/models/ner_rus/model
In [185]:
file = open('Мусоргский.txt', 'r')
file_lines = file.readlines()
lines = []

morph = pymorphy2.MorphAnalyzer()
for line in file_lines:
    line_norm = ''
    for word in line.split():
        line_norm = line_norm + ' ' + morph.parse(word)[0].normal_form
    lines.append(line_norm)
2019-09-15 15:13:35.724 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 16: Loading dictionaries from /usr/local/lib/python3.6/site-packages/pymorphy2_dicts-2.4.393442.3710985-py3.6.egg/pymorphy2_dicts/data
2019-09-15 15:13:35.808 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 20: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
In [186]:
def ppickle(item, file_name):
    file = open(file_name, 'wb') 
    pickle.dump(item, file)
In [187]:
def unpickle(file_name):
    file = open(file_name, 'rb') 
    return pickle.load(file)
In [188]:
def append(item, items):
    if item != '':
        items.append(item)
    return items
In [189]:
ner_output = []
locs = []; pers = []; orgs = []
for line in file_lines:
    ner = ner_model([line])
    ner_output.append(ner)
    words = ner[0][0]
    if len(words) != 0:
        tags = ner[1][0]
        loc = ''; per = ''; org = ''

        for i in range(len(tags)):
            w = morph.parse(words[i])[0].normal_form
            
            if tags[i] == 'B-LOC':
                locs = append(loc, locs)
                loc = w
                pers = append(per, pers)
                orgs = append(org, orgs)
            if tags[i] == 'I-LOC':
                loc = loc + ' ' + w

            if tags[i] == 'B-PER':
                locs = append(loc, locs)
                pers = append(per, pers)
                per = w
                orgs = append(org, orgs)
            if tags[i] == 'I-PER':
                per = per + ' ' + w

            if tags[i] == 'B-ORG':
                locs = append(loc, locs)
                pers = append(per, pers)
                orgs = append(org, orgs)
                org = w
            if tags[i] == 'I-ORG':
                org = org + ' ' + w
            
            if tags[i] == 'O':
                if loc != '':
                    locs.append(loc)
                    loc = ''
                if per != '':
                    pers.append(per)
                    per = ''
                if org != '':
                    orgs.append(org)
                    org = ''
In [190]:
# pickle everything
ppickle(ner_output, 'ner_otput')
ppickle(pers, 'persons')
ppickle(orgs, 'organizations')
ppickle(locs, 'locations')
In [191]:
# merge locs and pers, remove duplicates
locs_pers = locs + pers
locs_pers = list(dict.fromkeys(locs_pers))
In [194]:
# count 'O' tags for words from locs_pers
other_count = {}
for i in locs_pers:
    other_count[i] = 0
    
for j in range(len(file_lines)):
    ner = ner_model([file_lines[j]])
    line = lines[j]
    words = ner[0][0]
    if len(words) != 0:
        tags = ner[1][0]
        for item in locs_pers:
            spl = item.split()
            indices = [(i, i+len(spl)) for i in range(len(words)) if words[i:i+len(spl)] == spl] # [(0, 2), (3, 5)]
            if i != []:
                for i in indices:
                    if tags[i[0]] == 'O':
                        other_count[item] += 1
In [195]:
# count'PER' tags for words from locs_pers
persons_count = {}
for item in locs_pers:
    persons_count[item] = pers.count(item)
In [196]:
# count'LOC' tags for words from locs_pers
locations_count = {}
for item in locs_pers:
    locations_count[item] = locs.count(item)
In [197]:
# count'ORG' tags for words from locs_pers
organiztions_count = {}
for item in locs_pers:
    organiztions_count[item] = orgs.count(item)
In [198]:
persons_final = {}
locations_final = {}
In [199]:
# create final lists of locations and persons
for item in locs_pers:
    if locations_count[item] > persons_count[item] + organiztions_count[item] + other_count[item]:
        locations_final[item] = {'LOC': locations_count[item], 'PER': persons_count[item], 
                                 'ORG': organiztions_count[item], 'O': other_count[item]}
    else:
        if persons_count[item] > locations_count[item] + organiztions_count[item] + other_count[item]:
            persons_final[item] = {'PER': persons_count[item], 'LOC': locations_count[item],
                                 'ORG': organiztions_count[item], 'O': other_count[item]}
In [264]:
# correct tags
corrections = []
corr_locs = 0
for ner in ner_output:
    sentence = ner[0][0]
    words = []
    for i in range(len(sentence)):
        words.append(morph.parse(sentence[i])[0].normal_form)
    if len(words) != 0:
        tags = ner[1][0]
        for item in locations_final:
            spl = item.split()
            indices = [(i, i+len(spl)) for i in range(len(words)) if words[i:i+len(spl)] == spl] # [(0, 2), (3, 5)]
            if i != []:
                for i in indices:
                    if tags[i[0]] != 'B-LOC' and  tags[i[0]] != 'I-LOC':
                        corr_locs += 1
                        corrections.append('Named entity: '+ item + '. New tag: LOC. Old tag: '+ tags[i[0]] 
                                          + '. Sentence: ' + ' '.join(sentence))
In [265]:
corr_pers = 0
for ner in ner_output:
    sentence = ner[0][0]
    words = []
    for i in range(len(sentence)):
        words.append(morph.parse(sentence[i])[0].normal_form)
    if len(words) != 0:
        tags = ner[1][0]
        for item in persons_final:
            spl = item.split()
            indices = [(i, i+len(spl)) for i in range(len(words)) if words[i:i+len(spl)] == spl] # [(0, 2), (3, 5)]
            if i != []:
                for i in indices:
                    if tags[i[0]] != 'B-PER' and tags[i[0]] != 'I-PER':
                        corr_pers += 1
                        corrections.append('Named entity: '+ item + '. New tag: PER. Old tag: '+ tags[i[0]] 
                                          + '. Sentence: ' + ' '.join(sentence))
In [272]:
f = open("stats_Мусоргский.txt", "w")
st = 'DeepPavlov found ' + str(len(pers) + len(locs) + len(orgs)) + '  named entities: ' + str(len(pers))+ ' persons, ' + str(len(locs)) + ' locations and ' + str(len(orgs)) + ' organizations. \n' + str(len(corrections)) + ' corrections in total, out of them ' +  str(corr_locs) + ' words were labeled as locations and ' + str(corr_pers) + ' were labeled as persons.'
f.write(st)
Out[272]:
197
In [249]:
def dict_to_csv(file_name, item):
    with open(file_name, 'w') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in item.items():
            writer.writerow([key, value])
In [262]:
def list_to_txt(file_name, item):
    with open(file_name, 'w') as f:
        for i in item:
            f.write("%s\n" % i)
In [270]:
list_to_txt("corrections_Мусоргский.txt", corrections)
In [271]:
dict_to_csv("locactions_Мусоргский.txt", locations_final)
dict_to_csv("persons_Мусоргский.txt", persons_final)