1. 程式人生 > 其它 > selenium批量提取antiSMASH結果中的核心基因AA序列

selenium批量提取antiSMASH結果中的核心基因AA序列

技術標籤:antismash、生物資訊學、python、selenium

import os
import re
import time
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome() # Module-level Chrome WebDriver used by the functions below; created when the module is loaded

# Captures the value of the data-seq attribute on the "AA sequence" copy
# button. The value is matched with [^"]* instead of a greedy .* so the capture
# stops at the attribute's closing quote even when more quoted attributes (for
# example the nucleotide copy button) follow on the same line of the HTML.
_AA_SEQ_PATTERN = re.compile(
    r' AA sequence: <span class="clipboard-copy" data-seq="([^"]*)"'
)


def _extract_aa_seq(html):
    """Return the first AA sequence embedded in *html*, or None if absent."""
    match = _AA_SEQ_PATTERN.search(html)
    return match.group(1) if match else None


def _unique_name(name, taken):
    """Return *name*, or *name* plus the first numeric suffix (from 2) unused in *taken*."""
    if name not in taken:
        return name
    suffix = 2
    while name + str(suffix) in taken:
        suffix += 1
    return name + str(suffix)


def _write_fasta(file_path, names, seqs):
    """Write the paired *names* and *seqs* to *file_path* as FASTA records."""
    # The file is only written, never read back, so plain "w" is sufficient.
    with open(file_path, "w", encoding="utf-8") as handle:
        for name, seq in zip(names, seqs):
            handle.write(">" + name + "\n")
            handle.write(seq + "\n")


def click_event():
    """
    Click every region button of the loaded page and collect its core genes.

    Returns:
        A tuple ``(polygons_name, AA_seqs)``: the FASTA record names and the
        matching AA sequences for the whole page, in click order. A name that
        already occurs gets a numeric suffix (2, 3, ...) so names stay unique.
    """
    polygons_name = []  # record names for every core gene on the page
    AA_seqs = []  # AA sequences, index-aligned with polygons_name

    # All region buttons, to be clicked one after another.
    regbutton_list = browser.find_elements(
        By.CSS_SELECTOR, 'div[style="display: flex; flex-wrap: wrap"]>div'
    )
    for regbutton in regbutton_list:
        regbutton.click()  # show this region
        polygon_name, AA_seq = click_core()  # names and sequences of its core genes

        for polygon in polygon_name:
            polygons_name.append(_unique_name(polygon, polygons_name))
        AA_seqs.extend(AA_seq)

        time.sleep(2)  # pause before switching to the next region
    return polygons_name, AA_seqs


def click_core():
    """
    Click every core-gene polygon and read its heading and AA sequence.

    Returns:
        A tuple ``(polygon_name, AA_seq)`` of two equally long lists: one name
        and one sequence per core gene that could be read. A gene for which no
        non-empty heading or no AA sequence is found is skipped with a message,
        so the two lists never get out of step.
    """
    core_list = browser.find_elements(
        By.CSS_SELECTOR,
        'polygon[class="svgene-type-biosynthetic svgene-orf svgene-selected-orf"]',
    )
    polygon_name = []  # names of the core genes
    AA_seq = []  # sequences of the core genes, index-aligned with polygon_name
    for polygon in core_list:
        polygon.click()  # select the core gene so its details are shown

        # get_heading() may return empty strings; only non-empty ones are usable.
        heads = [head for head in get_heading() if head != ""]
        seqs = get_aaseq()
        if not heads or not seqs:
            print("Skipped a core gene: heading or AA sequence not found")
        else:
            # Exactly one name and one sequence per gene keeps the lists aligned.
            polygon_name.append(heads[0])
            AA_seq.append(seqs[0])

        time.sleep(1)  # pause before clicking the next polygon
    return polygon_name, AA_seq


def get_heading():
    """
    Return the text of every region heading element, normalised for FASTA use.

    The headings are located with the selector
    ``.page>.region-grid>.content>.description-container>.heading``.
    In each text " - " becomes "|" and all remaining spaces are removed.
    Entries may be empty strings; callers filter those out.
    """
    heading = browser.find_elements(
        By.CSS_SELECTOR,
        '.page>.region-grid>.content>.description-container>.heading',
    )
    return [head.text.replace(" - ", "|").replace(" ", "") for head in heading]


def get_aaseq():
    """
    Return the AA sequences found in the focus-panel clipboard elements.

    Each sequence is read from the ``data-seq`` attribute of the
    "AA sequence" copy button inside the element's outer HTML. Elements
    without such a button are ignored.
    """
    copy = browser.find_elements(
        By.CSS_SELECTOR,
        'div[style=""]>.region-grid>.focus-panel div[class="focus-clipboard"]',
    )
    aaseq = []
    for c in copy:
        copy_html = c.get_attribute('outerHTML') or ""
        data_seq = _extract_aa_seq(copy_html)
        if data_seq is not None:
            aaseq.append(data_seq)
    return aaseq


def sele_html(all_path, file_list):
    """
    Open each result's ``index.html`` in the browser and write its FASTA file.

    Args:
        all_path: directory that contains one sub-directory per result.
        file_list: names of the entries inside *all_path* to process. Entries
            that do not contain an ``index.html`` are skipped with a message.

    The browser is closed when processing ends, including when it ends with
    an exception.
    """
    try:
        for file_name in file_list:
            index_path = os.path.join(all_path, file_name, "index.html")
            if not os.path.isfile(index_path):
                print(file_name + " skipped: no index.html found")
                continue

            # as_uri() yields a well-formed file:// URL on every platform.
            browser.get(Path(index_path).resolve().as_uri())
            time.sleep(2)  # give the page time to load

            polygons_name, AA_seqs = click_event()

            file_path = "檔案儲存路徑" + file_name + ".fasta"  # edit: output path prefix
            _write_fasta(file_path, polygons_name, AA_seqs)
            print(file_name + " 寫入完畢!")
    finally:
        browser.quit()


def main():
    """List the entries of the results directory and process each of them."""
    # Edit this to the antiSMASH results directory: the directory whose
    # sub-directories each contain an index.html.
    all_path = "antismash結果路徑"
    file_list = os.listdir(all_path)
    sele_html(all_path, file_list)


if __name__ == '__main__':
    main()