Source code for silvio.extensions.sets.shotgun_sequencing.evaluation


from typing import Dict, List, Tuple
from math import inf

import numpy as np
from Bio.Seq import Seq

from ...common import Base
from .datatype import BaseProbDict, LocalizedSequence, EstimatedSequence



[docs]def calc_total_score ( estseq:EstimatedSequence, genome:Seq ) -> Tuple[float,int] : """ Shift the genome around the heatmap and get the best overall match. Score is a simple addition of all ratios of the correct base, normalized as percentage. """ gen_len = len(genome) est_len = len(estseq) best_score = -inf best_genome_start = 0 for genome_start in range( -gen_len, est_len ) : score = 0 for g_i in range(gen_len) : # g_i = index on genome sequence b_i = genome_start + g_i # b_i = map to an index in the bundle if 0 <= b_i < est_len : # only add score if it overlaps with the heatmap, 0 otherwise. gen_base = genome[g_i] hm_total = sum([ estseq.base_probs[base][b_i] for base in ['A','C','G','T'] ]) if hm_total > 0 : score += ( estseq.base_probs[gen_base][b_i] / hm_total ) if score > best_score : best_score = score best_genome_start = genome_start return ( best_score / len(genome), best_genome_start )
[docs]def calc_sequence_score ( locseqs:List[LocalizedSequence], genome:Seq, genome_start:int ) -> List[float] : """ Calculate the matching score of each singular localized sequence. """ # genome_start is relative to min(locseqs.locus) when that is shifted to zero. locseq_start = int(np.amin([ locseq.locus for locseq in locseqs ])) rel_genome_start = genome_start + locseq_start seq_scores = [ 0 for _ in locseqs ] for l_i, locseq in enumerate(locseqs) : score = 0 for s_i in range(len(locseq.sequence)) : g_i = locseq.locus + s_i - rel_genome_start if 0 <= g_i < len(genome) : if locseq.sequence[s_i] == genome[g_i] : score += 1 seq_scores[l_i] = score / len(locseq.sequence) return seq_scores
[docs]def evaluate_sequence ( estimated_sequence:EstimatedSequence, genome:Seq ) -> float : """ Compares an estimated sequence with the real genome and returns the true positive rate. Return value is a number between 0 (worst score) and 1 (best score). """ return calc_total_score( estimated_sequence, genome )[0]