// CSI 2131 Assignment 1 Part 1.2 Decompressing a compressed genome file
// Winter 2006 - Prof. L. Moura
// Solution by TA Naim R. El-Far (naim@discover.uottawa.ca)
// File: uncompress.cpp
// Desc: Driver file containing the whole project's source code
// Func: This program, all contained in this cpp file, reads a binary file with a sequence over a 2-bit binary
//       alphabet with the mapping A<-00, C<-01, G<-10, and T<-11, decompresses the binary characters into their
//       textual representation as per the previous mapping, and writes the decompressed output to a text file.
//
// Compressed file layout, as consumed by this program:
//       bytes [0, size-2) : fully packed bytes, four letters each
//       byte  size-2      : the last packed byte, of which only some letters are meaningful
//       byte  size-1      : the number of meaningful letters in the last packed byte (0 to 4)

#include <fstream>
#include <iostream>
#include <string>

using namespace std;

// Number of 2-bit letters packed into one byte of the compressed file
static const int kLettersPerByte = 4;

// Number of bytes at the end of the compressed file that are not plain packed data:
// the last (partially filled) packed byte and the byte holding its letter count
static const streamoff kTrailerBytes = 2;

//------------------------------------------------------------------------------------------------------------------
// Function name: ReportError
// Parameters:    The error message to display
// Returns:       Nothing
// Functionality: Writes the message to cerr framed above and below by a dashed rule of the same length as the
//                message, preceded by one blank line and followed by one blank line.
//------------------------------------------------------------------------------------------------------------------
static void ReportError(const string& message) {
    const string rule(message.size(), '-');
    cerr << endl
         << rule << endl
         << message << endl
         << rule << endl
         << endl;
}

//------------------------------------------------------------------------------------------------------------------
// Function name: Decode
// Parameters:    An encoded character charToDecode, and a pointer to a character array of at least 4 elements
// Returns:       Nothing
// Functionality: Unpacks the four 2-bit letters held in charToDecode into ptrToStorageArray[0..3]. The two
//                left-most bits give the first letter and the two right-most bits give the last one. The 2-bit
//                values 0, 1, 2, and 3 map respectively to the letters A, C, G, and T. The function always
//                writes all four elements; deciding how many of them are meaningful is the caller's job.
//------------------------------------------------------------------------------------------------------------------
void Decode(unsigned char charToDecode, char* ptrToStorageArray) {
    //Indexed by the 2-bit value; every masked value is in 0..3, so no "not in the alphabet" case can occur
    static const char alphabet[kLettersPerByte] = {'A', 'C', 'G', 'T'};

    ptrToStorageArray[0] = alphabet[(charToDecode >> 6) & 3]; //bits 11 00 00 00 (mask 192)
    ptrToStorageArray[1] = alphabet[(charToDecode >> 4) & 3]; //bits 00 11 00 00 (mask 48)
    ptrToStorageArray[2] = alphabet[(charToDecode >> 2) & 3]; //bits 00 00 11 00 (mask 12)
    ptrToStorageArray[3] = alphabet[charToDecode & 3];        //bits 00 00 00 11 (mask 3)
}

//------------------------------------------------------------------------------------------------------------------
// Function name: main
// Parameters:    None
// Returns:       0 on success, 1 if a file cannot be opened, the compressed file is malformed, or an I/O
//                operation fails
// Functionality: Prompts for the compressed and uncompressed file names (an empty answer selects the default),
//                validates the compressed file's trailer, then decodes the file and writes the resulting letters
//                both to cout and to the uncompressed file.
//------------------------------------------------------------------------------------------------------------------
int main() {
    //Print welcome message
    //---------------------
    cout << "CSI 2131 Asst 1 Part 1.3" << endl
         << "Solution by TA Naim R. El-Far (naim@discover.uottawa.ca)" << endl
         << endl;

    //Prompt user to input compressed file name (default: genome2.cmp)
    //----------------------------------------------------------------
    cout << "Please enter compressed file name (default: genome2.cmp): ";
    string compFileName;
    getline(cin, compFileName);
    if (compFileName.empty())
        compFileName = "genome2.cmp";

    //Open compressed file
    //--------------------
    ifstream compFile(compFileName.c_str(), ios::in | ios::binary);
    if (!compFile) {
        ReportError("Error! Specified compressed file could not be opened!");
        return 1;
    }

    //Validate the compressed file before touching the output file
    //------------------------------------------------------------
    //The size is kept as a stream offset rather than an int so large files are not truncated,
    //and a failed tellg() (which yields -1) is rejected by the same test as a too-short file
    compFile.seekg(0, ios::end);
    const streamoff sizeOfCompressedFile = static_cast<streamoff>(compFile.tellg());
    if (!compFile || sizeOfCompressedFile < kTrailerBytes) {
        ReportError("Error! Compressed file is too short to be valid!");
        return 1;
    }

    //Read the trailer: the last packed byte followed by the count of meaningful letters in it
    char lastLetters = 0;
    char numOfLetters = 0;
    compFile.seekg(-kTrailerBytes, ios::end);
    if (!compFile.get(lastLetters) || !compFile.get(numOfLetters)) {
        ReportError("Error! Failed while reading the compressed file!");
        return 1;
    }

    //Interpret the count as unsigned so bytes >= 128 are not seen as negative, and refuse counts
    //that would index past the 4 decoded letters
    const int lettersInLastByte = static_cast<unsigned char>(numOfLetters);
    if (lettersInLastByte > kLettersPerByte) {
        ReportError("Error! Compressed file has an invalid letter count!");
        return 1;
    }

    //Prompt user to input uncompressed file name (default: genome2.ucp)
    //------------------------------------------------------------------
    cout << "Please enter uncompressed file name (default: genome2.ucp): ";
    string uncompFileName;
    getline(cin, uncompFileName);
    if (uncompFileName.empty())
        uncompFileName = "genome2.ucp";

    //Create uncompressed file
    //------------------------
    ofstream uncompFile(uncompFileName.c_str(), ios::out);
    if (!uncompFile) {
        ReportError("Error! Specified uncompressed file could not be opened!");
        return 1;
    }

    //Decompressing algorithm
    //-----------------------
    compFile.seekg(0, ios::beg); //Return to the beginning of the file
    char decodedChars[kLettersPerByte] = {' ', ' ', ' ', ' '}; //Holds the 4 text characters of one packed byte
    char charFromFile = ' ';                                    //Holds the character read from the binary file

    //Every byte before the trailer carries exactly 4 letters
    const streamoff fullByteCount = sizeOfCompressedFile - kTrailerBytes;
    for (streamoff j = 0; j < fullByteCount; ++j) {
        if (!compFile.get(charFromFile)) {
            ReportError("Error! Failed while reading the compressed file!");
            return 1;
        }
        Decode(static_cast<unsigned char>(charFromFile), decodedChars);
        cout.write(decodedChars, kLettersPerByte);
        uncompFile.write(decodedChars, kLettersPerByte);
    }

    //Output only the meaningful letters of the last packed byte, ignoring the placeholders
    Decode(static_cast<unsigned char>(lastLetters), decodedChars);
    cout.write(decodedChars, lettersInLastByte);
    uncompFile.write(decodedChars, lettersInLastByte);
    cout << endl;

    //Housekeeping
    //------------
    //The stream destructors would close the files anyway; closing the output explicitly lets a
    //failed flush be reported instead of silently producing a truncated file
    compFile.close();
    uncompFile.close();
    if (!uncompFile) {
        ReportError("Error! Failed while writing the uncompressed file!");
        return 1;
    }

    //Successful termination of main
    //------------------------------
    return 0;
}