import os
import pickle
import re
import collections

import codecs
import cytoolz as ct
import numpy as np
from gensim.parsing import preprocessing
# BUG FIX: the original "import jieba.jeiba as jb" is a typo -- the jieba
# package has no "jeiba" submodule; Tokenizer lives on the top-level package.
import jieba as jb

#-------------------------------------------------------------------------------------------#
def load(line, tk, myre):
    """Tokenize and clean one line of (Chinese) text.

    Parameters:
        line: raw input line (str).
        tk:   an initialized jieba Tokenizer.
        myre: compiled regex matching emoji ranges to strip.

    Returns:
        The cleaned, space-joined, lower-cased line (str).
    """
    # Segment into words (full mode + HMM for out-of-vocabulary words),
    # dropping empty tokens, then rejoin with spaces for the regex passes.
    line = " ".join(x for x in tk.cut(line, cut_all=True, HMM=True) if x != "")

    # Remove links, at-mentions, hashtags, mark-up, and "RT" markers
    line = re.sub(r"http\S+", "", line)
    line = re.sub(r"@\S+", "", line)
    line = re.sub(r"#\S+", "", line)
    line = re.sub("<[^>]*>", "", line)
    line = line.replace(" RT", "").replace("RT ", "")

    # Remove emojis
    line = re.sub(myre, "", line)

    # Remove punctuation and extra spaces via gensim's preprocessing filters
    line = ct.pipe(line,
                   preprocessing.strip_tags,
                   preprocessing.strip_punctuation,
                   preprocessing.split_alphanum,
                   preprocessing.strip_non_alphanum,
                   preprocessing.strip_multiple_whitespaces,
                   )

    # Lower-case and trim. (The original's extra .lstrip() after .strip()
    # was redundant: strip() already removes leading whitespace.)
    return line.lower().strip()

#-------------------------------------------------------------------------------------------#
def main():
    """Rewrite every *.txt file in the CWD into 100-word lines, saved as fixed.<name>."""
    # Prepare the jieba tokenizer once, up front
    tk = jb.Tokenizer()
    tk.initialize()
    tk.lock = True

    # Emoji regex: wide (UCS-4) build first; surrogate-pair fallback for narrow builds
    try:
        myre = re.compile(u'['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+',
                          re.UNICODE)
    except re.error:
        myre = re.compile(u'('
                          u'\ud83c[\udf00-\udfff]|'
                          u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                          u'[\u2600-\u26FF\u2700-\u27BF])+',
                          re.UNICODE)

    for filename in os.listdir("."):
        # BUG FIX: skip our own output -- "fixed.X.txt" also ends in ".txt",
        # so without this guard a re-run would reprocess previous output.
        if not filename.endswith(".txt") or filename.startswith("fixed."):
            continue
        print("Starting " + filename)

        full_text = []   # finished 100-word lines
        chunk = []       # words accumulated toward the current line

        with codecs.open(filename, "r", encoding="utf-8") as fo:
            for line in fo:
                for word in load(line, tk, myre).split(" "):
                    chunk.append(word)
                    if len(chunk) >= 100:
                        # join() replaces the original's quadratic
                        # temp_string += word + " " string building
                        full_text.append(" ".join(chunk))
                        chunk = []

        # NOTE(review): the original silently discarded a trailing chunk of
        # fewer than 100 words at end of file; that behavior is preserved
        # here -- confirm whether the remainder should be flushed instead.

        with codecs.open("fixed." + filename, "w", encoding="utf-8") as fw:
            for line in full_text:
                fw.write(line + "\n")

#-------------------------------------------------------------------------------------------#
if __name__ == "__main__":
    main()