From e9d84e5de3470dd8d449e4bcdda16fd090302d17 Mon Sep 17 00:00:00 2001 From: Bahram Aghaei Date: Thu, 11 Apr 2019 14:56:37 +0430 Subject: [PATCH] make it better --- Chapter1/trivial_compression.py | 81 ++++++++++++++------------------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/Chapter1/trivial_compression.py b/Chapter1/trivial_compression.py index 3cf2a9f..0735955 100644 --- a/Chapter1/trivial_compression.py +++ b/Chapter1/trivial_compression.py @@ -1,54 +1,39 @@ # trivial_compression.py -# From Classic Computer Science Problems in Python Chapter 1 -# Copyright 2018 David Kopec -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - class CompressedGene: - def __init__(self, gene: str) -> None: - self._compress(gene) + MAP = { + 'A': 0b00, + 'C': 0b01, + 'G': 0b10, + 'T': 0b11, + } + def __init__(self, gene): + self.gene = gene + self.compres() - def _compress(self, gene: str) -> None: - self.bit_string: int = 1 # start with sentinel - for nucleotide in gene.upper(): - self.bit_string <<= 2 # shift left two bits - if nucleotide == "A": # change last two bits to 00 - self.bit_string |= 0b00 - elif nucleotide == "C": # change last two bits to 01 - self.bit_string |= 0b01 - elif nucleotide == "G": # change last two bits to 10 - self.bit_string |= 0b10 - elif nucleotide == "T": # change last two bits to 11 - self.bit_string |= 0b11 - else: - raise ValueError("Invalid Nucleotide:{}".format(nucleotide)) + def compres(self): + self.bit_string = 1 + for n in self.gene.upper(): + self.bit_string <<= 2 + self.bit_string |= self.MAP[n] - def decompress(self) -> str: - gene: str = "" - for i in range(0, self.bit_string.bit_length() - 1, 2): # - 1 to exclude sentinel - bits: int = self.bit_string >> i & 0b11 # get just 2 relevant bits - if bits == 0b00: # A - gene += "A" - elif bits == 0b01: # C - gene += "C" - elif bits == 0b10: # G - gene += "G" - elif bits == 0b11: # T - gene += "T" - else: - raise ValueError("Invalid bits:{}".format(bits)) - return gene[::-1] # [::-1] reverses string by slicing backwards + def decompress(self): + """ + At the end of compress method, we would save an integer number + in self.bit_string. There is this 'bin' function in python, by which you could pass + an integer number and it would return the binary representation of that number, like: + bin(12) + >>'0b1100' + So, we could solve the decompress by using this function, just skip the + first 3 characters + string = bin(self.bit_string)[3:] + now, we can split the string by two characters and each group represents a nucleotide. + """ + reverse_map = {v: k for k, v in self.MAP.items()} + gene = '' + for i in range(0, self.bit_string.bit_length() -1, 2): + bits = self.bit_string >> i & 0b11 + gene += reverse_map[bits] + return gene[::-1] def __str__(self) -> str: # string representation for pretty printing return self.decompress() @@ -61,4 +46,4 @@ def __str__(self) -> str: # string representation for pretty printing compressed: CompressedGene = CompressedGene(original) # compress print("compressed is {} bytes".format(getsizeof(compressed.bit_string))) print(compressed) # decompress - print("original and decompressed are the same: {}".format(original == compressed.decompress())) \ No newline at end of file + print("original and decompressed are the same: {}".format(original == compressed.decompress()))