|
04、项目开发的一般流程与数据预处理_笔记与视频
课程配套的数据cnews_data.zip下载【回复本帖可见】:
高清视频下载地址【回复本帖可见】:
- # -*- coding: utf-8 -*-
- __author__ = u'东方耀 微信:dfy_88888'
- __date__ = '2019/3/21 20:28'
- __product__ = 'PyCharm'
- __filename__ = '1_pre_process'
- import sys
- import os
- import jieba
# Input files: one document per line, formatted as "<label>\t<content>".
train_file = './cnews_data/cnews.train.txt'
test_file = './cnews_data/cnews.test.txt'
val_file = './cnews_data/cnews.val.txt'

# Output files: the same documents after jieba word segmentation.
seg_train_file = './cnews_data/cnews.train.seg.txt'
seg_test_file = './cnews_data/cnews.test.seg.txt'
seg_val_file = './cnews_data/cnews.val.seg.txt'

# Vocabulary file: "<word>\t<frequency>", sorted by descending frequency.
vocab_file = './cnews_data/cnews.vocab.txt'

# Category file: one category label per line.
category_file = './cnews_data/cnews.category.txt'
def generate_seg_file(input_file, output_seg_file):
    """Segment every document in input_file with jieba and write the result.

    Each input line is "<label>\t<content>"; each output line is
    "<label>\t<w1 w2 w3 ...>" with single-space-separated tokens.

    Args:
        input_file: path to a raw cnews file (utf-8).
        output_seg_file: path the segmented copy is written to (utf-8).
    """
    with open(input_file, 'r', encoding='utf-8') as fin:
        lines = fin.readlines()
    with open(output_seg_file, 'w', encoding='utf-8') as fout:
        for line in lines:
            # Split on the first tab only, so a tab inside the content
            # does not raise a ValueError on unpacking.
            label, content = line.strip('\r\n').split('\t', 1)
            # join() instead of repeated string += : linear, not quadratic,
            # and empty tokens from jieba are dropped as before.
            word_content = ' '.join(
                word.strip(' ')
                for word in jieba.cut(content)
                if word.strip(' ') != '')
            fout.write('%s\t%s\n' % (label, word_content))
- # generate_seg_file(val_file, seg_val_file)
- # generate_seg_file(train_file, seg_train_file)
- # generate_seg_file(test_file, seg_test_file)
def generate_vocab_file(input_seg_file, output_vocab_file):
    """Count word frequencies in a segmented file and write a vocabulary.

    Output format: "<word>\t<frequency>", one word per line, sorted by
    descending frequency.  A pseudo-word <UNK> with a huge count is written
    first so it stays at index 0 after any frequency-based truncation.

    Args:
        input_seg_file: segmented file ("<label>\t<w1 w2 ...>", utf-8).
        output_vocab_file: destination vocabulary file (utf-8).
    """
    from collections import Counter  # local import keeps this edit self-contained

    word_counts = Counter()
    with open(input_seg_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the first tab only; content may legally contain tabs.
            label, content = line.strip('\r\n').split('\t', 1)
            # Skip empty tokens produced by consecutive spaces so the
            # vocabulary never contains the empty string.
            word_counts.update(w for w in content.split(' ') if w)

    with open(output_vocab_file, 'w', encoding='utf-8') as f:
        f.write('<UNK>\t999999999999\n')
        # most_common() yields (word, freq) pairs sorted by descending count.
        for word, freq in word_counts.most_common():
            f.write('%s\t%s\n' % (word, freq))
- # generate_vocab_file(seg_train_file, vocab_file)
def generate_category_file(input_file, output_category_file):
    """Collect the distinct labels in input_file and write one per line.

    Also prints each label with its document count to stdout as a summary.

    Args:
        input_file: "<label>\t<content>" file (utf-8).
        output_category_file: destination file, one category per line (utf-8).
    """
    category_count = {}
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the first tab only; content may legally contain tabs.
            label, _content = line.strip('\r\n').split('\t', 1)
            category_count[label] = category_count.get(label, 0) + 1

    with open(output_category_file, 'w', encoding='utf-8') as f:
        for category, count in category_count.items():
            # print() appends its own newline; the old '%s\t%d\n' format
            # produced a spurious blank line after every category.
            print('%s\t%d' % (category, count))
            f.write('%s\n' % category)
- generate_category_file(seg_train_file, category_file)
复制代码
|
|