| | |
| |
|
| | import re |
| | import six |
| | import unicodedata |
| | import torch |
| | import rouge |
| | import numpy as np |
| | import random |
| | |
| | import sys |
| |
|
# Extend the import search path by three directory levels. The entry is a
# relative path, so it is resolved against the process's current working
# directory, not against this file's location.
sys.path.append('../../../')

# Rebind the name `rouge` from the imported module to a scorer instance.
# After this line the module itself is no longer reachable through `rouge`;
# compute_rouge() in this file calls `rouge.get_scores` on this instance.
rouge = rouge.Rouge()

# True when running under Python 2, as reported by six.
is_py2 = six.PY2

# Python 3 has no `basestring`; alias it to `str` so that is_string() can use
# the same name on both major versions.
if not is_py2:
    basestring = str
| |
|
| |
|
| | def _is_chinese_char(cp): |
| | """Checks whether CP is the codepoint of a CJK character.""" |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) |
| | or (cp >= 0x20000 and cp <= 0x2A6DF) |
| | or (cp >= 0x2A700 and cp <= 0x2B73F) |
| | or (cp >= 0x2B740 and cp <= 0x2B81F) |
| | or (cp >= 0x2B820 and cp <= 0x2CEAF) |
| | or (cp >= 0xF900 and cp <= 0xFAFF) |
| | or (cp >= 0x2F800 and cp <= 0x2FA1F)): |
| | return True |
| |
|
| | return False |
| |
|
| |
|
| | def _is_whitespace(char): |
| | """Checks whether `char` is a whitespace character.""" |
| | |
| | |
| | if char == " " or char == "\t" or char == "\n" or char == "\r": |
| | return True |
| | cat = unicodedata.category(char) |
| | if cat == "Zs": |
| | return True |
| | return False |
| |
|
| |
|
| | def _is_control(char): |
| | """Checks whether `char` is a control character.""" |
| | |
| | |
| | if char == "\t" or char == "\n" or char == "\r": |
| | return False |
| | cat = unicodedata.category(char) |
| | if cat.startswith("C"): |
| | return True |
| | return False |
| |
|
| |
|
| | def _is_punctuation(char): |
| | """Checks whether `char` is a punctuation character.""" |
| | cp = ord(char) |
| | |
| | |
| | |
| | |
| | if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( |
| | cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): |
| | return True |
| | cat = unicodedata.category(char) |
| | if cat.startswith("P"): |
| | return True |
| | return False |
| |
|
| |
|
def is_string(s):
    """Return True when `s` is a string.

    The check uses the module-level `basestring` name, which this file
    aliases to `str` when not running under Python 2.
    """
    string_type = basestring
    return isinstance(s, string_type)
| |
|
| |
|
def is_stopwords(word, stopwords):
    """Return True when `word` is contained in `stopwords`.

    `stopwords` may be any container supporting the `in` operator.
    """
    return word in stopwords
| |
|
| |
|
| | def text_segmentate(text): |
| | en_seg_pattern = '((?:\\!|\\?|\\.|\\n)+(?:\\s)+)' |
| | ch_seg_pattern = '((?:?|!|。|\\n)+)' |
| | try: |
| | text = re.sub(en_seg_pattern, r'\1[SEP]', text) |
| | |
| | except Exception as e: |
| | print("input: ", text) |
| | raise e |
| | text = re.sub(ch_seg_pattern, r'\1[SEP]', text) |
| | |
| | text_list = text.split("[SEP]") |
| | text_list = list(filter(lambda x: len(x) != 0, text_list)) |
| | return text_list |
| |
|
| |
|
def load_stopwords(stopwords_path):
    """Read a stop-word file into a dict usable for fast membership tests.

    Each line is stripped of surrounding whitespace and becomes a key mapped
    to 0. Duplicate lines collapse into a single key. A blank line
    contributes the empty string as a key.

    Args:
        stopwords_path: path of a text file with one stop word per line,
            opened with the platform's default text encoding.

    Returns:
        dict mapping every distinct stripped line to 0.
    """
    with open(stopwords_path, "r") as rf:
        return dict.fromkeys((line.strip() for line in rf), 0)
| |
|
| |
|
def text_process(text, max_length):
    """Group the segments of `text` into chunks of at least three segments.

    The text is split with text_segmentate(). Segments are accumulated into
    a list; once adding the next segment would push the accumulated
    character count above ``max_length * 1.3`` and the list already holds
    at least three segments, the list is yielded and a new one is started.
    A trailing chunk is yielded only if it holds at least three segments.

    Yields:
        Lists of consecutive segment strings.
    """
    chunk = []
    chunk_chars = 0
    for sentence in text_segmentate(text):
        if chunk_chars + len(sentence) > max_length * 1.3 and len(chunk) >= 3:
            yield chunk
            chunk = []
            chunk_chars = 0
        chunk.append(sentence)
        chunk_chars += len(sentence)
    if len(chunk) >= 3:
        yield chunk
| |
|
| |
|
def text_process_split_long_content(text, max_length):
    """Cut a long text into concatenated chunks of at least three segments.

    The text is split with text_segmentate() and segments are appended to a
    running string:

    * A segment longer than 500 characters is never added. When it is met,
      the running string is yielded if it is longer than 300 characters and
      holds at least three segments; in either case the running string is
      then discarded and accumulation restarts.
    * Otherwise, if appending the segment would exceed ``max_length * 1.1``
      characters and at least three segments are already accumulated, the
      running string is yielded first and accumulation restarts with this
      segment.

    A trailing chunk is yielded only if it holds at least three segments.

    Yields:
        Strings made of consecutive segments joined without a separator.
    """
    buffer, count = "", 0
    for sentence in text_segmentate(text):
        if len(sentence) > 500:
            if len(buffer) > 300 and count >= 3:
                yield buffer
            # The oversized segment is dropped and so is whatever was
            # accumulated before it, whether or not it was yielded.
            buffer, count = "", 0
            continue

        if len(buffer) + len(sentence) > max_length * 1.1 and count >= 3:
            yield buffer
            buffer, count = "", 0
        buffer += sentence
        count += 1

    if buffer and count >= 3:
        yield buffer
| |
|
| |
|
def gather_join(texts, idxs):
    """Pick the entries of `texts` at positions `idxs` and concatenate them.

    The pieces are joined in the order given by `idxs`, with no separator.
    """
    selected = (texts[position] for position in idxs)
    return ''.join(selected)
| |
|
| |
|
def gather_join_f1(texts_token, idsx):
    """Concatenate the token sequences of `texts_token` selected by `idsx`.

    Returns:
        A new flat list holding the tokens of every selected sequence, in
        the order given by `idsx`.
    """
    return [token for position in idsx for token in texts_token[position]]
| |
|
| |
|
def compute_rouge(source, target):
    """Compute the rouge-1, rouge-2 and rouge-l F-scores of two sequences.

    Both arguments are joined with single spaces before scoring, so each is
    expected to be an iterable of string tokens. `source` is passed as the
    hypothesis and `target` as the reference to the module-level `rouge`
    scorer.

    Returns:
        dict with keys 'rouge-1', 'rouge-2' and 'rouge-l' holding the 'f'
        value of the first score entry. All three are 0.0 when scoring
        raises ValueError (presumably for empty input - TODO confirm
        against the rouge package).
    """
    hypothesis = ' '.join(source)
    reference = ' '.join(target)
    metric_names = ('rouge-1', 'rouge-2', 'rouge-l')
    try:
        first_score = rouge.get_scores(hyps=hypothesis, refs=reference)[0]
        return {name: first_score[name]['f'] for name in metric_names}
    except ValueError:
        return dict.fromkeys(metric_names, 0.0)
| |
|
| |
|
def remove_stopwords(texts, stopwords_dict):
    """Drop stop words from every token sequence in `texts`, in place.

    Each entry of `texts` is replaced by a new list containing only the
    tokens not found in `stopwords_dict`. The same (mutated) `texts` object
    is returned.
    """
    for position, tokens in enumerate(texts):
        texts[position] = [tok for tok in tokens if tok not in stopwords_dict]
    return texts
| |
|
| |
|
def pseudo_summary_f1(texts,
                      stopwords,
                      tokenizer,
                      max_length,
                      rouge_strategy="rouge-l"):
    """Build a pseudo-summary by splitting sentences into source and target.

    Sentences are moved greedily, one per round, from the source pool to the
    target. In each round every remaining source sentence is tried, and the
    one whose move gives the highest `rouge_strategy` score between the
    stop-word-filtered tokens of the remaining source and those of the
    would-be target is selected. Selection stops once a single source
    sentence is left or the target/source character-length ratio exceeds
    0.25. If the final source text is shorter than the target text, the two
    sides (texts and index lists) are swapped.

    Args:
        texts: sized sequence of sentence strings; at least two are needed.
        stopwords: container of tokens to ignore when scoring (tested with
            `in`).
        tokenizer: object providing `encode(str)` and
            `_convert_id_to_token(id)`.
        max_length: accepted for interface compatibility; it does not
            influence the result.
        rouge_strategy: key of the score returned by compute_rouge() used
            for ranking ('rouge-1', 'rouge-2' or 'rouge-l').

    Returns:
        Tuple ``(sentence_ids, source, target, source_idxs, target_idxs)``:
        the per-sentence token id lists, the concatenated source and target
        texts, and the sentence indices making up each side.

    Raises:
        ValueError: if fewer than two sentences are given, or re-raised
            unchanged when `tokenizer.encode` fails on a sentence.
    """
    # With zero or one sentence the selection loop can never terminate
    # normally (empty argmax / empty source), so fail early and clearly.
    if len(texts) < 2:
        raise ValueError(
            "pseudo_summary_f1 requires at least two sentences, got %d" %
            len(texts))

    summary_rate = 0.25
    texts_tokens = []
    sentece_idxs_vec = []
    for text in texts:
        try:
            # The last id returned by encode() is dropped (presumably a
            # trailing special token - TODO confirm against the tokenizer).
            ids = tokenizer.encode(text.strip())[:-1]
        except ValueError:
            print("error, input : ", text)
            # Bare raise keeps the tokenizer's original message/traceback.
            raise
        sentece_idxs_vec.append(ids)
        texts_tokens.append(
            [tokenizer._convert_id_to_token(token) for token in ids])

    # remove_stopwords() filters in place, so both names refer to the same
    # list afterwards.
    texts_tokens_rm = remove_stopwords(texts_tokens, stopwords)
    source_idxs, target_idxs = list(range(len(texts))), []

    # Every sentence keeps a slot so that indices line up with `texts`.
    assert len(texts_tokens) == len(texts)

    while True:
        sims = []
        for i in source_idxs:
            new_source_idxs = [j for j in source_idxs if j != i]
            new_target_idxs = sorted(target_idxs + [i])
            new_source = gather_join_f1(texts_tokens_rm, new_source_idxs)
            new_target = gather_join_f1(texts_tokens_rm, new_target_idxs)
            sims.append(compute_rouge(new_source, new_target)[rouge_strategy])
        new_idx = source_idxs[np.argmax(sims)]
        source_idxs.remove(new_idx)
        target_idxs = sorted(target_idxs + [new_idx])
        source = gather_join(texts, source_idxs)
        target = gather_join(texts, target_idxs)
        try:
            if (len(source_idxs) == 1
                    or 1.0 * len(target) / len(source) > summary_rate):
                break
        except ZeroDivisionError as e:
            # All remaining source sentences are empty strings: report it
            # and keep selecting until one source sentence is left.
            print(e)
            print(texts)
            print("source: ", source)
            print("target: ", target)

    if len(source) < len(target):
        source, target = target, source
        source_idxs, target_idxs = target_idxs, source_idxs

    return sentece_idxs_vec, source, target, source_idxs, target_idxs
| |
|
| |
|
def get_input_mask(sentence_id_vec, indexs):
    """Build masked encoder input ids and decoder target ids.

    For every sentence index in `indexs`, the sentence's token ids are
    appended to the target and the sentence is then altered inside
    `sentence_id_vec` according to a random draw compared against the
    cumulative thresholds below: replaced by the single mask-sentence token,
    replaced by a randomly chosen sentence, kept unchanged, or emptied.
    The input is the concatenation of all (possibly altered) sentences.
    Both sequences are terminated with the EOS id.

    Note: `sentence_id_vec` is modified in place.

    Args:
        sentence_id_vec: list of token-id lists, one per sentence.
        indexs: iterable of sentence positions selected as targets.

    Returns:
        Tuple ``(input_idxs, target_idxs)`` of flat token-id lists.
    """
    kMaskSentenceTokenId = 2
    kEosTokenId = 1
    # Cumulative thresholds for: mask token, random sentence, keep, drop.
    # With these values only "mask" (< 0.9) and "keep" are ever selected.
    mask_sentence_options_cumulative_prob = [0.9, 0.9, 1, 1]

    target_idxs = []
    for index in indexs:
        target_idxs.extend(sentence_id_vec[index])
        choice = random.uniform(0, 1)
        if choice < mask_sentence_options_cumulative_prob[0]:
            # Replace the whole sentence with the mask-sentence token.
            sentence_id_vec[index] = [kMaskSentenceTokenId]
        elif choice < mask_sentence_options_cumulative_prob[1]:
            # Replace with a random sentence. randrange() excludes the upper
            # bound, whereas randint(0, len(...)) could return an index one
            # past the end of the list.
            replace_id = random.randrange(len(sentence_id_vec))
            sentence_id_vec[index] = sentence_id_vec[replace_id]
        elif choice < mask_sentence_options_cumulative_prob[2]:
            # Keep the sentence unchanged.
            pass
        else:
            # Drop the sentence from the input.
            sentence_id_vec[index] = []
    target_idxs.append(kEosTokenId)

    input_idxs = []
    for sentence_id in sentence_id_vec:
        # Emptied sentences contribute nothing.
        input_idxs.extend(sentence_id)
    input_idxs.append(kEosTokenId)
    return input_idxs, target_idxs
| |
|
| |
|
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int,
                       decoder_start_token_id: int):
    """Shift a 2-D batch of token ids one position to the right.

    Column 0 of the result is filled with `decoder_start_token_id`, the
    remaining columns hold `input_ids` without its last column, and every
    -100 entry in the shifted tensor is replaced by `pad_token_id`.

    Returns:
        A new tensor with the same shape, dtype and device as `input_ids`.

    Raises:
        ValueError: if `pad_token_id` is None.
    """
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # -100 entries carried over from the input are replaced by the pad id.
    ignored_positions = shifted == -100
    shifted.masked_fill_(ignored_positions, pad_token_id)

    return shifted
| |
|
| |
|
def padding_to_maxlength(ids, max_length, pad_id):
    """Right-pad a list of ids to `max_length` and build its attention mask.

    Args:
        ids: list of token ids.
        max_length: desired length after padding.
        pad_id: value appended as padding.

    Returns:
        Tuple ``(padded_ids, mask)`` where `mask` holds 1 for every original
        id and 0 for every padding position. When `ids` is already longer
        than `max_length` nothing is appended and nothing is truncated.
    """
    real_count = len(ids)
    pad_count = max_length - real_count
    padded_ids = ids + [pad_id] * pad_count
    mask = [1] * real_count + [0] * pad_count
    return padded_ids, mask
| |
|