selenium批量提取antiSMASH結果中的核心基因AA序列

阿新 • • 發佈：2021-01-30

技術標籤：antismash 生物資訊學 python selenium

import os
from selenium  import webdriver
import time
import re

browser = webdriver.Chrome() # Module-level Chrome driver shared by every function below

def click_event():
    """Click through every region button on the loaded page and collect core genes.

    Each region button is clicked in turn and ``click_core`` is called to
    harvest the names and amino-acid sequences of the core genes shown for
    that region. Names that already exist in the result get a numeric suffix
    so that every record name written later is unique.

    Returns:
        A ``(polygons_name, AA_seqs)`` tuple of lists for one strain: the
        record names and the amino-acid sequences, in the order collected.
    """
    polygons_name = []  # names of all core genes of this strain
    AA_seqs = []  # sequences of all core genes of this strain

    # The find_elements_by_* helpers were removed in Selenium 4.3;
    # find_elements(by, value) works on both old and new releases.
    regbutton_list = browser.find_elements(
        "css selector", 'div[style="display: flex; flex-wrap: wrap"]>div'
    )
    for regbutton in regbutton_list:
        regbutton.click()  # open a single region
        # Click the core-gene polygons of this region and read name + sequence.
        polygon_name, AA_seq = click_core()

        # Numeric suffixes start at 2 for each region; keep bumping the
        # suffix until the candidate is really unused so that a name already
        # taken by an earlier region is never emitted twice.
        suffix = 2
        for polygon in polygon_name:
            if polygon in polygons_name:
                candidate = polygon + str(suffix)
                while candidate in polygons_name:
                    suffix += 1
                    candidate = polygon + str(suffix)
                polygons_name.append(candidate)
                suffix += 1
            else:
                polygons_name.append(polygon)

        AA_seqs.extend(AA_seq)

        time.sleep(2)  # pause before clicking the next region
    return polygons_name, AA_seqs

def click_core():
    """Click every core-gene polygon of the open region and collect its data.

    For each matching polygon the element is clicked, the sequence is read
    through ``get_aaseq`` and the non-empty headings returned by
    ``get_heading`` are used as record names.

    Returns:
        A ``(polygon_name, AA_seq)`` tuple of lists: the names and the
        amino-acid sequences gathered for this region.
    """
    # The find_elements_by_* helpers were removed in Selenium 4.3;
    # find_elements(by, value) works on both old and new releases.
    core_list = browser.find_elements(
        "css selector",
        'polygon[class="svgene-type-biosynthetic svgene-orf svgene-selected-orf"]',
    )
    polygon_name = []  # names of the core genes of this region
    AA_seq = []  # sequences of the core genes of this region
    for polygon in core_list:
        polygon.click()  # select the core gene

        sequences = get_aaseq()
        if sequences:
            # Headings of the page serve as the names for the gene's record.
            polygon_name.extend(head for head in get_heading() if head != "")
            AA_seq.append(sequences[0])  # sequence of this single core gene
        else:
            # Previously this raised IndexError; skip the gene instead and
            # record no name for it so names and sequences stay aligned.
            print("No AA sequence found for a core gene; skipping it.")

        time.sleep(1)  # pause 1s before clicking the next polygon
    return polygon_name, AA_seq

def get_heading():
    """Return the normalised heading text of every matching region page.

    Elements are looked up with the CSS path
    ``.page>.region-grid>.content>.description-container>.heading``, i.e.
    markup of the form::

        <div class="page" id="r10c1" style="display: block;">
          <div class="region-grid">
            <div class="content">
              <div class="description-container">
                <div class="heading">

    In each heading text ``" - "`` is replaced by ``"|"`` and every remaining
    space is removed. One region may contain several core genes.

    Returns:
        A list with one cleaned string per matched element; entries may be
        empty strings, which callers are expected to filter out.
    """
    # The find_elements_by_* helpers were removed in Selenium 4.3;
    # find_elements(by, value) works on both old and new releases.
    heading = browser.find_elements(
        "css selector",
        '.page>.region-grid>.content>.description-container>.heading',
    )
    return [head.text.replace(" - ", "|").replace(" ", "") for head in heading]


def get_aaseq():
    """Extract the amino-acid sequences from the focus panel's clipboard blocks.

    Every ``div.focus-clipboard`` matched by the selector below is read as
    outer HTML and the ``data-seq`` attribute following the text
    ``AA sequence:`` is pulled out with a regular expression.

    Returns:
        A list of sequence strings, one per clipboard block that contained an
        AA sequence; blocks without a match are skipped.
    """
    # Compiled once, outside the loop. ``[^"]*`` stops at the attribute's
    # closing quote, whereas the former greedy ``.*`` could run on to a later
    # quote when more markup follows on the same line.
    data_rule = re.compile(
        ' AA sequence: <span class="clipboard-copy" data-seq="([^"]*)"'
    )
    # The find_elements_by_* helpers were removed in Selenium 4.3;
    # find_elements(by, value) works on both old and new releases.
    copy = browser.find_elements(
        "css selector",
        'div[style=""]>.region-grid>.focus-panel div[class="focus-clipboard"]',
    )
    aaseq = []
    for c in copy:
        # Raw HTML of the clipboard block; the sequence lives in an attribute.
        copy_html = c.get_attribute('outerHTML') or ""
        match = data_rule.search(copy_html)
        if match is None:
            # No AA sequence in this block: skip instead of raising IndexError.
            continue
        aaseq.append(match.group(1))
    return aaseq


def sele_html(all_path, file_list):
    """Drive the browser over every local result page and write FASTA files.

    For each entry of ``file_list`` the page ``all_path + entry + "/index.html"``
    is opened, ``click_event`` collects the core-gene names and sequences, and
    the pairs are written to ``<output prefix><entry>.fasta``. The browser is
    closed when the loop ends, including when an error interrupts it.

    Args:
        all_path: Directory prefix of the results; it is concatenated
            directly with each entry, so it must end with a path separator.
        file_list: Names of the per-strain result directories.
    """
    try:
        for file_name in file_list:
            url = "file:///" + all_path + file_name + "/index.html"
            browser.get(url)
            time.sleep(2)  # give the page 2s to load

            # Click through the page; get all names and sequences of the strain.
            polygons_name, AA_seqs = click_event()
            if len(polygons_name) != len(AA_seqs):
                # zip() below pairs only up to the shorter list; make the
                # mismatch visible instead of failing halfway through a file.
                print(
                    file_name + ": " + str(len(polygons_name)) + " names but "
                    + str(len(AA_seqs)) + " sequences; writing matched pairs only."
                )

            file_path = "檔案儲存路徑" + file_name + ".fasta"  # output path prefix: edit here
            # The context manager closes the file even if a write fails.
            with open(file_path, "w") as file:
                for name, seq in zip(polygons_name, AA_seqs):
                    file.write(">" + name + "\n")
                    file.write(seq + '\n')
            print(file_name + " 寫入完畢！")
    finally:
        # Always release the Chrome process, even after an exception.
        browser.quit()


def main():
    """Process every entry found in the antiSMASH results directory."""
    # Edit this placeholder so it points at the antiSMASH results directory.
    all_path = "antismash結果路徑"
    sele_html(all_path, os.listdir(all_path))


# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()