|
05、项目相关数据flickr30k介绍与词表生成_笔记
训练数据flickr30k(已标注好的图像)下载:链接:https://pan.baidu.com/s/1ola0honCkwPLdV4DW0TGsg
提取码:eggh
预训练好的Inception CNN神经网络数据下载:链接:https://pan.baidu.com/s/1rHivh5dlG6yevwCmY19Y6w
提取码:5ter
# -*- coding: utf-8 -*-
# Build a frequency-sorted vocabulary file from the Flickr30k caption
# annotations (results_20130124.token) for an image-captioning project.
__author__ = u'东方耀 微信:dfy_88888'
__date__ = '2019/5/15 11:30'
__product__ = 'PyCharm'
__filename__ = 'generate_vocab'

import pprint

# Input: Flickr30k token file; each line is "<image>#<n>\t<caption>".
input_description_file = './results_20130124.token'
# Output: one "<word>\t<count>" per line, most frequent first, <UNK> on top.
output_vocab_file = './vocab.txt'
def count_vocab(input_description_file):
    """Count word frequencies in a Flickr30k token annotation file.

    Each non-empty line of the file has the form
    ``<image_id>\t<description>`` (tab-separated). Descriptions are
    tokenized by whitespace.

    As a side effect, prints the maximum sentence length and a histogram
    mapping sentence length -> number of sentences of that length.

    :param input_description_file: path to the annotation (.token) file
    :return: dict-like mapping word -> frequency (a collections.Counter)
    :raises ValueError: if a non-empty line does not contain exactly one tab
    """
    from collections import Counter

    max_length_of_sentences = 0
    # key: sentence length, value: how many sentences have that length
    length_dict = Counter()
    # key: word, value: word frequency
    vocab_dict = Counter()
    # Iterate the file lazily instead of loading it all with readlines().
    with open(input_description_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at end of file),
            # which would otherwise break the 2-way tab unpacking below.
            if not line.strip():
                continue
            image_id, description = line.strip('\n').split('\t')
            words = description.split()
            max_length_of_sentences = max(max_length_of_sentences, len(words))
            length_dict[len(words)] += 1
            vocab_dict.update(words)
    print('max_length_of_sentences: %d' % max_length_of_sentences)
    # Convert to a plain dict so pprint output matches the original format.
    pprint.pprint(dict(length_dict))
    return vocab_dict
# Count word frequencies over the annotation file, then write the
# vocabulary: a sentinel <UNK> entry pinned to the top with an
# artificially huge count, followed by real words, most frequent first.
vocab_dict = count_vocab(input_description_file)
sorted_vocab_dict = sorted(vocab_dict.items(), key=lambda kv: kv[1], reverse=True)
with open(output_vocab_file, 'w', encoding='utf-8') as f:
    f.write('<UNK>\t99999999999999999\n')
    for word, freq in sorted_vocab_dict:
        f.write('%s\t%d\n' % (word, freq))
复制代码
|
|