1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
| import collections import re
def read_time_machine():
    """Load the Time Machine text as a list of normalized lines.

    Reads ``./data/timemachine.txt`` (relative to the current working
    directory). In every line, each run of characters other than ASCII
    letters is replaced by a single space; the line is then stripped of
    surrounding whitespace and lower-cased.

    Returns:
        A list with one cleaned string per line of the file. Lines that
        contain no letters become empty strings.
    """
    non_letters = re.compile('[^A-Za-z]+')
    with open('./data/timemachine.txt') as f:
        raw_lines = f.readlines()

    cleaned = []
    for raw in raw_lines:
        cleaned.append(non_letters.sub(' ', raw).strip().lower())
    return cleaned
def tokenize(lines, token='word'): '''将文本拆分为单词或字符词元''' if token == 'word': return [line.split() for line in lines] elif token == 'char': return [list(line) for line in lines] else: print('错误:未知类型词元:' + token)
def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Either a flat list of tokens or a list of token lists
            (one inner list per line). An empty list is accepted.

    Returns:
        A ``collections.Counter`` mapping each token to its frequency.
    """
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not is_nested:
        # Already a flat sequence of tokens.
        return collections.Counter(tokens)

    # Nested input: walk every line and tally its tokens in order.
    frequencies = collections.Counter()
    for line in tokens:
        for token in line:
            frequencies[token] += 1
    return frequencies
class Vocab:
    """Vocabulary mapping between tokens and integer indices.

    Index 0 is always the unknown token '<unk>'. Reserved tokens follow in
    the order given, then corpus tokens in descending order of frequency.
    Every token is stored exactly once.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a token collection.

        Args:
            tokens: Flat list of tokens or list of token lists, handed to
                ``count_corpus`` for frequency counting. ``None`` means
                an empty corpus.
            min_freq: Corpus tokens whose frequency is below this value
                are left out of the vocabulary.
            reserved_tokens: Tokens that are always included regardless of
                frequency. ``None`` means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        counter = count_corpus(tokens)
        # (token, count) pairs, most frequent first.
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # idx_to_token: list where position i holds the token with index i.
        # token_to_idx: inverse dict from token to its index.
        self.idx_to_token = []
        self.token_to_idx = {}

        # Route every insertion through _add_token so that a reserved token
        # equal to '<unk>' (or repeated reserved tokens) cannot create
        # duplicate entries that make the two mappings disagree.
        self._add_token('<unk>')
        for token in reserved_tokens:
            self._add_token(token)
        for token, freq in self._token_freqs:
            if freq < min_freq:
                # Frequencies are sorted descending, so all remaining
                # tokens are below the threshold as well.
                break
            self._add_token(token)

    def _add_token(self, token):
        """Append ``token`` with the next free index unless already present."""
        if token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens not in the vocabulary map to the unknown index.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a (nested) list/tuple of indices, to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]

    @property
    def unk(self):
        """Index of the unknown token (always 0)."""
        return 0

    @property
    def tokens_freqs(self):
        """List of (token, count) pairs sorted by descending count."""
        return self._token_freqs
def load_corpus_time_machine(max_tokens=-1):
    """Return the token-index corpus and the vocabulary of the Time Machine text.

    The text is read with ``read_time_machine``, split into word tokens
    with ``tokenize`` and indexed with a ``Vocab`` built from those tokens.

    Args:
        max_tokens: When positive, only the first ``max_tokens`` indices
            are kept; otherwise the whole corpus is returned.

    Returns:
        A ``(corpus, vocab)`` tuple where ``corpus`` is a flat list of
        token indices and ``vocab`` is the vocabulary they refer to.
    """
    word_lines = tokenize(read_time_machine())
    vocab = Vocab(word_lines)

    # Flatten all lines into one index sequence; indexing the vocabulary
    # with a list of tokens yields the list of their indices.
    corpus = []
    for line in word_lines:
        corpus.extend(vocab[line])

    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab
|