selenium批量提取antiSMASH結果中的核心基因AA序列
阿新 • • 發佈:2021-01-30
技術標籤:antismash、生物資訊學、python、selenium、生物資訊學
import os
from selenium import webdriver
import time
import re
browser = webdriver.Chrome() # Global Chrome WebDriver instance shared by every function below
def click_event():
    """Click through every region of the loaded page and collect its core genes.

    Each region button found on the page is clicked in turn and
    ``click_core`` is called to gather the names and amino-acid sequences of
    the core genes of that region.

    Returns:
        A ``(polygons_name, AA_seqs)`` tuple of two lists for one strain:
        the core-gene names (made unique with a numeric suffix starting at
        2) and the core-gene sequences, both in click order.
    """
    polygons_name = []  # names of all core genes of this strain
    AA_seqs = []  # sequences of all core genes of this strain
    # Mirrors polygons_name so the duplicate check is a set lookup.
    used_names = set()

    # ``find_elements_by_css_selector`` was removed in Selenium 4.3; the
    # generic ``find_elements(by, value)`` form exists in Selenium 3 and 4.
    regbutton_list = browser.find_elements(
        'css selector',
        'div[style="display: flex; flex-wrap: wrap"]>div',
    )
    for regbutton in regbutton_list:
        regbutton.click()  # open a single region
        # Click the core-gene polygons and fetch their names and sequences.
        polygon_name, AA_seq = click_core()

        for polygon in polygon_name:
            # Append 2, 3, ... until the name is unused. Unlike a counter
            # that restarts for each region, this can never hand out the
            # same suffixed name twice.
            unique_name = polygon
            suffix = 2
            while unique_name in used_names:
                unique_name = polygon + str(suffix)
                suffix += 1
            used_names.add(unique_name)
            polygons_name.append(unique_name)

        AA_seqs.extend(AA_seq)
        time.sleep(2)  # pause before clicking the next region
    return polygons_name, AA_seqs
def click_core():
    """Click every core gene of the selected region and collect its data.

    Returns:
        A ``(polygon_name, AA_seq)`` tuple of lists: the non-empty heading
        texts gathered after each click, and the first sequence returned by
        ``get_aaseq`` after each click.

    Raises:
        IndexError: If ``get_aaseq`` returns no sequence after a click.
    """
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3; the
    # generic ``find_elements(by, value)`` form exists in Selenium 3 and 4.
    core_list = browser.find_elements(
        'css selector',
        'polygon[class="svgene-type-biosynthetic svgene-orf svgene-selected-orf"]',
    )
    polygon_name = []  # names for the core genes of this region
    AA_seq = []  # sequences of the core genes of this region
    for polygon in core_list:
        polygon.click()  # select the core gene
        # The region heading is used as the name of the gene; headings with
        # empty text are dropped.
        # NOTE(review): names and sequences stay aligned only if exactly one
        # heading is non-empty per click - confirm against the page layout.
        polygon_name.extend(head for head in get_heading() if head != "")
        AA_seq.append(get_aaseq()[0])  # sequence of this single core gene
        time.sleep(1)  # pause before clicking the next polygon
    return polygon_name, AA_seq
def get_heading():
    """Return the cleaned text of every region heading matched on the page.

    The headings are located through this element chain:

        <div class="page" id="r10c1" style="display: block;">
          <div class="region-grid">
            <div class="content">
              <div class="description-container">
                <div class="heading">

    In each heading text " - " is replaced by "|" and every remaining space
    is removed.

    Returns:
        A list with one string per matched heading element.
        NOTE(review): presumably headings of pages that are not displayed
        come back as empty strings, which the caller filters out - confirm.
    """
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3; the
    # generic ``find_elements(by, value)`` form exists in Selenium 3 and 4.
    heading = browser.find_elements(
        'css selector',
        '.page>.region-grid>.content>.description-container>.heading',
    )
    return [head.text.replace(" - ", "|").replace(" ", "") for head in heading]
def get_aaseq():
    """Return the amino-acid sequences exposed by the focus-panel clipboards.

    The sequence is read from the ``data-seq`` attribute of the
    ``clipboard-copy`` span that follows the " AA sequence: " label inside
    each matched ``focus-clipboard`` element.

    Returns:
        A list of sequence strings, one per matched element that contains
        an AA-sequence span. Elements without one are skipped.
    """
    # Compiled once, outside the loop. ``[^"]*`` stops at the closing quote
    # of the attribute, whereas a greedy ``.*`` would run on to the last
    # quote on the same line and swallow any markup that follows.
    data_rule = re.compile(
        r' AA sequence: <span class="clipboard-copy" data-seq="([^"]*)"'
    )
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3; the
    # generic ``find_elements(by, value)`` form exists in Selenium 3 and 4.
    copy = browser.find_elements(
        'css selector',
        'div[style=""]>.region-grid>.focus-panel div[class="focus-clipboard"]',
    )
    aaseq = []
    for c in copy:
        # The attribute value is extracted from the element's raw HTML.
        copy_html = c.get_attribute('outerHTML')
        match = data_rule.search(copy_html)
        if match is None:
            # No AA-sequence span in this element; nothing to collect.
            continue
        aaseq.append(match.group(1))
    return aaseq
def sele_html(all_path, file_list):
    """Open every local result page with Selenium and write one FASTA file each.

    Args:
        all_path: Path prefix of the result folders. It is concatenated
            directly with each entry of ``file_list``, so it has to end
            with a path separator.
        file_list: Folder names below ``all_path``; the page loaded for
            each one is ``<all_path><name>/index.html``.

    Raises:
        IndexError: If ``click_event`` returns more names than sequences.

    The browser is closed when the function finishes, including when an
    error interrupts the run.
    """
    try:
        for file_name in file_list:
            url = "file:///" + all_path + file_name + "/index.html"
            browser.get(url)
            time.sleep(2)  # give the page time to load
            # Names and sequences of all core genes of this strain.
            polygons_name, AA_seqs = click_event()
            # Output path prefix - replace the placeholder with a real folder.
            file_path = "檔案儲存路徑" + file_name + ".fasta"
            # ``with`` closes the file even if a write fails; the explicit
            # encoding keeps the output independent of the system locale.
            with open(file_path, "w", encoding="utf-8") as fasta:
                for i, name in enumerate(polygons_name):
                    fasta.write(">" + name + "\n")
                    fasta.write(AA_seqs[i] + '\n')
            print(file_name + " 寫入完畢!")
    finally:
        # Always release the browser so no driver process is left behind.
        browser.quit()
def main():
    """Find the result folders under the configured path and process them."""
    # Edit this placeholder: directory holding the antiSMASH result folders.
    # It must end with a path separator because ``sele_html`` concatenates
    # it directly with each folder name.
    all_path = "antismash結果路徑"
    # ``os.listdir`` also returns plain files and unrelated folders; keep
    # only entries that contain an ``index.html`` page, in a stable order.
    file_list = sorted(
        name
        for name in os.listdir(all_path)
        if os.path.isfile(os.path.join(all_path, name, "index.html"))
    )
    sele_html(all_path, file_list)
# Run the extraction only when the file is executed as a script.
if __name__ == '__main__':
    main()