Difference: ProtocolsPythonSnippets (1 vs. 3)

Revision 3 - 2021-11-30 - MattMcGuffie

META TOPICPARENT	name="ComputationList"

Python snippets for biology

Requires BioPython to be installed

Open common formats:

.gbk / .gbf / .gb

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

with open('/my/path/file.gb', 'r') as file_handle:
    # index every record in the file by its record id
    record_dict = SeqIO.to_dict(SeqIO.parse(file_handle, 'gb'))
# dicts keep insertion order, so the first key is the first record in the file
gbkFile = record_dict[list(record_dict.keys())[0]]
# the above is if there is only 1 record in the GBK file
# if multiple records are in the file, such as a genome & plasmid,
# this will only extract the first record

.fa / .fasta / .fna

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# list of SeqRecord objects, one per FASTA entry, in file order
fastaList = list(SeqIO.parse("path/file.fasta", "fasta"))

Write common formats:

.gbk / .gbf / .gb

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# seqRecordObj_or_list is a placeholder: a single SeqRecord or a list of them
SeqIO.write(seqRecordObj_or_list, '/my/path/file.gbk', 'gb')

.fa / .fasta / .fna

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# seqRecordObj_or_list is a placeholder: a single SeqRecord or a list of them
SeqIO.write(seqRecordObj_or_list, '/my/path/file.fasta', 'fasta')

local BLASTing from Python

Also requires NCBI BLAST command line software, a local BLAST database, and Pandas %CODE{"python"}% from Bio.Seq import Seq from Bio import SeqIO from Bio.SeqRecord import SeqRecord import subprocess from tempfile import NamedTemporaryFile import pandas as pd

Changed:

<
<

def BLAST(seq, db = 'nr_db', type = "p"):

>
>

def BLAST(seq, db = 'path/to/db', Type = "blastn"):

# 'seq' is a sequence (as a str) of a protein or nucleotide sequence # 'db' points to location of local BLAST database # 'Type' specifies the BLAST program to run (e.g. 'blastn', 'blastp', 'blastx', etc)

query = NamedTemporaryFile() tmp = NamedTemporaryFile() SeqIO.write(SeqRecord(Seq(seq), id="temp"), query.name, "fasta")

flags = 'qstart qend sseqid sframe pident slen sseq length sstart send qlen' # 'flags' specifies the specific outputs for 'output format 6' in the BLAST CL software extras = "-max_target_seqs 20000 -culling_limit 10 -perc_identity 75" # 'extras' are further flags that can be called on the CL

subprocess.call( #the actual CL BLAST

Changed:

<
<

(f'blast{type} -query {query.name} -out {tmp.name} ' f'-db {db} {extras} -word_size {str(wordsize)} -outfmt "6 {flags}"'),

>
>

(f'{Type} -query {query.name} -out {tmp.name} ' f'-db {db} {extras} -outfmt "6 {flags}"'),

shell=True)

with open(tmp.name, "r") as file_handle: #opens BLAST file align = file_handle.readlines()

tmp.close() query.close()

df = pd.DataFrame([ele.split() for ele in align], columns = flags.split()) df = df.apply(pd.to_numeric, errors='ignore') # puts the output of BLAST into a tidy Pandas dataframe

return df %ENDCODE%

Revision 2 - 2020-12-03 - JeffreyBarrick

META TOPICPARENT	name="ComputationList"

Python snippets for biology

Requires BioPython to be installed

Open common formats:

.gbk / .gbf / .gb

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

with open('/my/path/file.gb', 'r') as file_handle:
    # index every record in the file by its record id
    record_dict = SeqIO.to_dict(SeqIO.parse(file_handle, 'gb'))
# dicts keep insertion order, so the first key is the first record in the file
gbkFile = record_dict[list(record_dict.keys())[0]]
# the above is if there is only 1 record in the GBK file
# if multiple records are in the file, such as a genome & plasmid,
# this will only extract the first record

.fa / .fasta / .fna

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# list of SeqRecord objects, one per FASTA entry, in file order
fastaList = list(SeqIO.parse("path/file.fasta", "fasta"))

Write common formats:

.gbk / .gbf / .gb

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# seqRecordObj_or_list is a placeholder: a single SeqRecord or a list of them
SeqIO.write(seqRecordObj_or_list, '/my/path/file.gbk', 'gb')

.fa / .fasta / .fna

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# seqRecordObj_or_list is a placeholder: a single SeqRecord or a list of them
SeqIO.write(seqRecordObj_or_list, '/my/path/file.fasta', 'fasta')

local BLASTing from Python

Also requires NCBI BLAST command line software, a local BLAST database, and Pandas

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import subprocess
from tempfile import NamedTemporaryFile
import pandas as pd

def BLAST(seq, db = 'nr_db', type = "p", wordsize = None):
    """Run a local NCBI BLAST search and return the hits as a DataFrame.

    Args:
        seq: protein or nucleotide sequence, as a str.
        db: location of the local BLAST database.
        type: suffix selecting the BLAST program (e.g. 'n', 'p', 'x');
            the executable run is ``blast<type>``. The name shadows the
            builtin but is kept so existing keyword callers still work.
        wordsize: optional value for BLAST's ``-word_size`` flag. When
            None (the default) the flag is omitted and BLAST uses its own
            default.

    Returns:
        A pandas DataFrame with one row per line of BLAST output and one
        column per field named in ``flags``; columns that parse fully as
        numbers are converted to numeric dtypes.

    Note:
        The exit status of the BLAST program is not checked; if BLAST
        writes no output the returned DataFrame is empty.
    """
    # 'flags' specifies the specific outputs for 'output format 6' in the BLAST CL software
    flags = 'qstart qend sseqid sframe pident slen sseq length sstart send qlen'
    # 'extras' are further flags that can be called on the CL
    extras = "-max_target_seqs 20000 -culling_limit 10 -perc_identity 75"

    # The context managers guarantee both temp files are removed even if
    # writing the query or running BLAST raises.
    with NamedTemporaryFile() as query, NamedTemporaryFile() as tmp:
        SeqIO.write(SeqRecord(Seq(seq), id="temp"), query.name, "fasta")

        # Build an argument list instead of a shell string so paths with
        # spaces or shell metacharacters in 'db' are passed through verbatim.
        command = [f'blast{type}', '-query', query.name, '-out', tmp.name,
                   '-db', str(db), *extras.split()]
        if wordsize is not None:
            command += ['-word_size', str(wordsize)]
        command += ['-outfmt', f'6 {flags}']

        subprocess.call(command)  # the actual CL BLAST

        with open(tmp.name, "r") as file_handle:  # opens BLAST file
            align = file_handle.readlines()

    # puts the output of BLAST into a tidy Pandas dataframe
    df = pd.DataFrame([ele.split() for ele in align], columns = flags.split())
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            # non-numeric fields (e.g. sseqid, sseq) stay as text
            pass

    return df

Deleted:

<
<

-- No permission to view TWiki.UserReports

Comments

No such template def TMPL:DEF{PROMPT:above}

Revision 1 - 2020-12-03 - MattMcGuffie

META TOPICPARENT	name="ComputationList"

Python snippets for biology

Requires BioPython to be installed

Open common formats:

.gbk / .gbf / .gb

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

with open('/my/path/file.gb', 'r') as file_handle:
    # index every record in the file by its record id
    record_dict = SeqIO.to_dict(SeqIO.parse(file_handle, 'gb'))
# dicts keep insertion order, so the first key is the first record in the file
gbkFile = record_dict[list(record_dict.keys())[0]]
# the above is if there is only 1 record in the GBK file
# if multiple records are in the file, such as a genome & plasmid,
# this will only extract the first record

.fa / .fasta / .fna

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# list of SeqRecord objects, one per FASTA entry, in file order
fastaList = list(SeqIO.parse("path/file.fasta", "fasta"))

Write common formats:

.gbk / .gbf / .gb

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# seqRecordObj_or_list is a placeholder: a single SeqRecord or a list of them
SeqIO.write(seqRecordObj_or_list, '/my/path/file.gbk', 'gb')

.fa / .fasta / .fna

from Bio import SeqIO  # SeqIO is a submodule of the Bio (Biopython) package

# seqRecordObj_or_list is a placeholder: a single SeqRecord or a list of them
SeqIO.write(seqRecordObj_or_list, '/my/path/file.fasta', 'fasta')

local BLASTing from Python

Also requires NCBI BLAST command line software, a local BLAST database, and Pandas

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import subprocess
from tempfile import NamedTemporaryFile
import pandas as pd

def BLAST(seq, db = 'nr_db', type = "p", wordsize = None):
    """Run a local NCBI BLAST search and return the hits as a DataFrame.

    Args:
        seq: protein or nucleotide sequence, as a str.
        db: location of the local BLAST database.
        type: suffix selecting the BLAST program (e.g. 'n', 'p', 'x');
            the executable run is ``blast<type>``. The name shadows the
            builtin but is kept so existing keyword callers still work.
        wordsize: optional value for BLAST's ``-word_size`` flag. When
            None (the default) the flag is omitted and BLAST uses its own
            default.

    Returns:
        A pandas DataFrame with one row per line of BLAST output and one
        column per field named in ``flags``; columns that parse fully as
        numbers are converted to numeric dtypes.

    Note:
        The exit status of the BLAST program is not checked; if BLAST
        writes no output the returned DataFrame is empty.
    """
    # 'flags' specifies the specific outputs for 'output format 6' in the BLAST CL software
    flags = 'qstart qend sseqid sframe pident slen sseq length sstart send qlen'
    # 'extras' are further flags that can be called on the CL
    extras = "-max_target_seqs 20000 -culling_limit 10 -perc_identity 75"

    # The context managers guarantee both temp files are removed even if
    # writing the query or running BLAST raises.
    with NamedTemporaryFile() as query, NamedTemporaryFile() as tmp:
        SeqIO.write(SeqRecord(Seq(seq), id="temp"), query.name, "fasta")

        # Build an argument list instead of a shell string so paths with
        # spaces or shell metacharacters in 'db' are passed through verbatim.
        command = [f'blast{type}', '-query', query.name, '-out', tmp.name,
                   '-db', str(db), *extras.split()]
        if wordsize is not None:
            command += ['-word_size', str(wordsize)]
        command += ['-outfmt', f'6 {flags}']

        subprocess.call(command)  # the actual CL BLAST

        with open(tmp.name, "r") as file_handle:  # opens BLAST file
            align = file_handle.readlines()

    # puts the output of BLAST into a tidy Pandas dataframe
    df = pd.DataFrame([ele.split() for ele in align], columns = flags.split())
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            # non-numeric fields (e.g. sseqid, sseq) stay as text
            pass

    return df

Warning: Can't INCLUDE TWiki.UserReports repeatedly, topic is already included.

Comments

No such template def TMPL:DEF{PROMPT:above}

View topic | History: r3 < r2 < r1 | More topic actions...