mongodb中文文字資料(新聞評論)預處理程式碼(python+java)
阿新 • • 發佈:2018-11-19
中文文字資料預處理
Mongodb資料匯出到txt文件
#python
# coding=utf-8
from pymongo import MongoClient
# Export the comment text of one news article from MongoDB to a text file,
# one comment per line.

# Open a connection to the local MongoDB server.
client = MongoClient('localhost', 27017)
try:
    # Select the database; "news" is the database name.
    db = client.news
    # Select the collection (MongoDB's counterpart of a table).
    collection = db.news_comment2_600w
    with open("comment.txt", 'w', encoding='UTF-8') as f:
        # Filter on a single url_hash and project only the "text" field.
        for txt in collection.find(
                {"url_hash": "aad54fce101da05eb1688ef0389a8e84559f4fdf"},
                {"text": 1}):
            # Skip documents whose "text" field is missing or empty.
            if txt.get('text'):
                # Append a newline so every comment lands on its own line.
                f.write(txt['text'] + "\n")
finally:
    # Release the connection even if the export fails part-way.
    client.close()
將檔案按行寫入陣列
#python
class StrToArr:
    """Helpers for loading a text file into a list of lines."""

    @staticmethod
    def cn_str_to_arr(path, temp, encoding='GBK'):
        """Append every line of the file at *path* to the list *temp*.

        Lines are appended in file order and keep their trailing newline.
        *temp* is modified in place; nothing is returned.

        Args:
            path: Path of the text file to read.
            temp: List that receives the lines.
            encoding: Text encoding of the file. Defaults to ``'GBK'``.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid in *encoding*.
        """
        with open(path, "r", encoding=encoding) as ad_file:
            # A text file iterates line by line, so extend() appends each line.
            temp.extend(ad_file)
#java
private List<String> segLines(File file) throws Exception {
BufferedReader bf = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
List<String> temp = new ArrayList<>();
String str;
while ((str = bf.readLine()) != null) {
//String str2 = str + "\r\n";
temp.add(str);
}
bf.close();
return temp;
##資料去重,去非中文文字(表情,英文等),過濾無用資訊
#python
import re

# A line that mentions any of these keywords is treated as noise and dropped.
# Compiled once here instead of on every loop iteration.
_NOISE_PATTERN = re.compile(
    r'.*?((造謠.*)|(網易.*)|(沒死.*)|(媒體.*)|(小編.*)|(小便.*))')

# Matches runs of characters that are neither a newline nor a word character.
# Spaces, punctuation and emoji are removed by this pattern.
# NOTE(review): in Python 3 str patterns \w also matches Latin letters, digits
# and "_", so those are kept. If English text should be removed as well,
# drop \w from the class -- confirm the intent before changing it.
_NON_WORD_PATTERN = re.compile(r'[^\n\w\u4E00-\u9FA5]+')


def pretreatment(src="comment.txt", dst="uni_data.txt"):
    """Clean and de-duplicate a file of comments, one comment per line.

    Each line of *src* is handled as follows:

    1. Lines containing one of the noise keywords are discarded.
    2. Spaces, punctuation and other non-word characters are removed.
    3. A line whose cleaned text was already seen is discarded.

    The surviving cleaned lines are written to *dst* in their original order.
    Both files are read and written as UTF-8.

    Args:
        src: Path of the input file. Defaults to ``"comment.txt"``.
        dst: Path of the output file. Defaults to ``"uni_data.txt"``.
    """
    seen = set()          # cleaned lines already emitted (O(1) lookup)
    cleaned_lines = []    # cleaned lines in first-seen order
    with open(src, "r", encoding='UTF-8') as pre_file:
        for line in pre_file:
            if _NOISE_PATTERN.match(line) is not None:
                continue
            cleaned = _NON_WORD_PATTERN.sub('', line)
            # De-duplicate on the cleaned text: comparing the raw line against
            # cleaned entries would miss duplicates that cleaning had altered.
            if cleaned in seen:
                continue
            seen.add(cleaned)
            cleaned_lines.append(cleaned)
    with open(dst, "w", encoding='UTF-8') as uni_file:
        uni_file.writelines(cleaned_lines)


if __name__ == '__main__':
    pretreatment()
文字批量修改(加字尾等)
#python
def add_ad(src="***.txt", dst="ad_word.txt", suffix="ad", encoding="GBK"):
    """Append a space and *suffix* to every line of *src*, writing to *dst*.

    Every output line ends with a newline. A final input line that has no
    trailing newline still receives the suffix.

    Args:
        src: Path of the input file. The default ``"***.txt"`` appears to be
            a placeholder -- pass the real file name.
        dst: Path of the output file. Defaults to ``"ad_word.txt"``.
        suffix: Text added after a single space. Defaults to ``"ad"``.
        encoding: Encoding used for both files. Defaults to ``"GBK"``.
    """
    with open(src, "r", encoding=encoding) as ad_file:
        # Strip the line ending first so the suffix is added whether or not
        # the line had one, then terminate every line uniformly.
        tagged = [line.rstrip("\n") + " " + suffix + "\n" for line in ad_file]
    with open(dst, "w", encoding=encoding) as f:
        f.writelines(tagged)


if __name__ == "__main__":
    add_ad()
個人認為,Java功能確實很強大,但python在自然語言處理方面確實也有不俗的能力;在讀取檔案的部分,python更為方便。特別是讀取excel表格時,python可以完美地將每一列或者每一行作為索引,直接在文件中進行資料定位、關聯等操作,而Java相對比較繁瑣。
對中文文字的情感分析的程式碼,以及超級工具包感興趣的可以看下一篇文章,我打算把自己半個月折騰的成果分享出來,不算精彩卻很實用,便於二次開發。