// CSI 2131 Assignment 1 Part 1.2 Compressing a genome file
// Winter 2006 - Prof. L. Moura
// Solution by TA Naim R. El-Far (naim@discover.uottawa.ca)
// File: compress.cpp
// Desc: Driver file containing the whole project's source code
// Func: This program, all contained in this cpp file, reads a text file with a sequence over the alphabet
//       {A, C, G, T} and compresses the file into a binary file built with the mapping A->00, C->01, G->10,
//       and T->11. Four letters are packed per byte (first letter in the two most significant bits); a
//       final extra byte records how many letters the last data byte holds.

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>

using namespace std;

//------------------------------------------------------------------------------------------------------------------
// Function name: encode
// Parameters:    A character charToEncode, and the byte (passed by reference) where to encode it.
// Returns:       Nothing
// Functionality: Shifts the given byte left by two bits and stores the 2-bit code of charToEncode in the freed
//                low bits (A = 0, C = 1, G = 2, T = 3). The function does not track how many letters the byte
//                already holds; the caller is responsible for flushing the byte after four letters.
//                Any other character prints an error to cerr and terminates the program with exit code 1.
//------------------------------------------------------------------------------------------------------------------
void encode(char charToEncode, unsigned char& byte)
{
    unsigned char code = 0;
    switch (charToEncode) {
        case 'A': code = 0; break;
        case 'C': code = 1; break;
        case 'G': code = 2; break;
        case 'T': code = 3; break;
        default:
            cerr << endl
                 << "------------------------------------------------------" << endl
                 << "Error! Encoucntered a letter other than A, C, G, or T!" << endl
                 << "------------------------------------------------------" << endl
                 << endl;
            exit(1);
    }

    // The shift leaves the two low bits clear, so OR-ing the code in is the same as adding it.
    byte = static_cast<unsigned char>((byte << 2) | code);
}

//----------------------------------------------------------------------------------------------------------
// Function name: printByte
// Parameters:    A byte c
// Returns:       Nothing
// Functionality: Debugging aid. Prints the byte as a character, as a decimal number, and as eight binary
//                digits (most significant bit first) to cout.
//----------------------------------------------------------------------------------------------------------
void printByte(unsigned char c)
{
    cout << "Byte value: " << c << endl;
    cout << "Byte decimal value: " << static_cast<int>(c) << endl;
    cout << "Byte binary representation: ";
    for (int bitIndex = 7; bitIndex >= 0; bitIndex--) {
        cout << ((c >> bitIndex) & 1);
    }
    cout << endl << endl;
}

//----------------------------------------------------------------------------------------------------------
// Function name: formatNumberByte
// Parameters:    A byte by reference and the number of characters (1 to 4) encoded in that byte
// Returns:       Nothing
// Functionality: Left-aligns a partially filled byte so that it looks like, for example, TT-- instead of
//                --TT. Each missing letter accounts for a shift of two bits. With numOfChars = 0 the byte
//                would be shifted by 8 bits and become all 0's; main() never calls it in that case.
//----------------------------------------------------------------------------------------------------------
void formatNumberByte(unsigned char& byte, int numOfChars)
{
    byte = static_cast<unsigned char>(byte << (2 * (4 - numOfChars)));
}

//----------------------------------------------------------------------------------------------------------
// Function name: writeByte
// Parameters:    The output stream and the byte to append to it
// Returns:       Nothing
// Functionality: Writes exactly one raw byte to the stream (unformatted output).
//----------------------------------------------------------------------------------------------------------
static void writeByte(ostream& out, unsigned char byte)
{
    out.write(reinterpret_cast<const char*>(&byte), 1);
}

//-------------
//Main function
//-------------
// Prompts for the genome (input) and compressed (output) file names, packs the genome letters four per
// byte, and appends a final byte holding the number of letters stored in the last data byte.
// Returns 0 on success and 1 if a file cannot be opened or the output cannot be written.
int main()
{
    //Print welcome message
    //---------------------
    cout << "CSI 2131 Asst 1 Part 1.2" << endl
         << "Solution by TA Naim R. El-Far (naim@discover.uottawa.ca)" << endl
         << endl;

    //Prompt user to input genome file name (default: genome1.txt)
    //------------------------------------------------------------
    cout << "Please enter genome file name (default: genome1.txt): ";
    string genomeFileName;
    getline(cin, genomeFileName);
    if (genomeFileName.empty())
        genomeFileName = "genome1.txt";

    //Open genome file
    //----------------
    // The genome file is only ever read, so it is opened read-only; asking for write access as well
    // would make the open fail on read-only files.
    ifstream genomeFile(genomeFileName.c_str());
    if (!genomeFile) {
        cerr << endl
             << "-------------------------------------------------" << endl
             << "Error! Specified genome file could not be opened!" << endl
             << "-------------------------------------------------" << endl
             << endl;
        return 1;
    }

    //Prompt user to input compressed file name (default: genome1.cmp)
    //-----------------------------------------------------------------
    cout << "Please enter lowcomp file name (default: genome1.cmp): ";
    string compFileName;
    getline(cin, compFileName);
    if (compFileName.empty())
        compFileName = "genome1.cmp";

    //Create compressed file
    //----------------------
    ofstream compFile(compFileName.c_str(), ios::out | ios::binary);
    if (!compFile) {
        cerr << endl
             << "-----------------------------------------------------" << endl
             << "Error! Specified compressed file could not be opened!" << endl
             << "-----------------------------------------------------" << endl
             << endl;
        return 1;
    }

    //Compression algorithm
    //---------------------
    char charFromFile = ' ';        //The character most recently read from the genome file
    unsigned char byteToWrite = 0;  //The byte currently being filled with encoded letters
    int charCounter = 0;            //Number of letters in byteToWrite, modulo 4

    while (genomeFile.get(charFromFile)) {
        // Line terminators (typically a trailing newline added by a text editor) are not part of the
        // sequence; skip them instead of aborting with the "letter other than A, C, G, or T" error.
        if (charFromFile == '\n' || charFromFile == '\r')
            continue;

        encode(charFromFile, byteToWrite);
        charCounter = (charCounter + 1) % 4;
        if (charCounter == 0) {
            //Four letters have been packed: flush the byte and start a fresh one
            //printByte(byteToWrite);
            writeByte(compFile, byteToWrite);
            byteToWrite = 0;
        }
    }

    if (charCounter != 0) {
        //A partially filled byte is pending: left-align it (TT-- rather than --TT) and flush it
        formatNumberByte(byteToWrite, charCounter);
        //printByte(byteToWrite);
        writeByte(compFile, byteToWrite);
    }

    //The last byte of the file holds the number of letters in the byte before it. A counter of 0 here
    //means the last data byte was full, i.e. it holds 4 letters.
    //NOTE(review): an empty genome file also ends up here with a counter of 0, so the output is a lone
    //count byte of 4 with no data byte before it - confirm the decompressor copes with that.
    const unsigned char lettersInLastByte = static_cast<unsigned char>(charCounter == 0 ? 4 : charCounter);
    //printByte(lettersInLastByte);
    writeByte(compFile, lettersInLastByte);

    //Housekeeping
    //------------
    genomeFile.close();
    compFile.close();  //Closing flushes the buffered output; a failed write or flush leaves the stream in a failed state

    if (!compFile) {
        cerr << endl
             << "----------------------------------------------" << endl
             << "Error! Could not write to the compressed file!" << endl
             << "----------------------------------------------" << endl
             << endl;
        return 1;
    }

    //Successful termination of main
    //------------------------------
    return 0;
}