// FilterChain.hpp
//
// (C) 2002-2009 MicroNeil Research Corporation
//
// This is the base class header for FilterChain objects.
// FilterChain objects can be chained together to filter
// a byte stream. Each object produces a single character
// per call. It will also call its source object for the
// next character as required.

// History...

// 20060822 _M
// Adding FilterChainHeaderAnalysis to identify missing headers and header
// anomalies, and to extract and test IP data.

// 20060127 _M
// Added FilterChainCBFR to accept a buffer of a specific
// length.

// 20041116 _M Added UrlDecode module. The module will repeat a decoded version of
// any anchor tag that it sees which contains decodable %xx bytes. Other anchor
// tags are not repeated.

// 20041116 _M Upgrades to the Defunker module. The module now decodes any HTML
// encoded bytes that could have been normal ascii.

// 20041114 _M Completed basic defunker engine which strips out all HTML and some
// basic &nbsp; encoding.

// 20041113 _M Began heavy upgrades to this module to improve performance and
// provide additional obfuscation removal. This modification will include a move
// from the use of switch(State) mechanisms to the use of function pointers. This
// should save a few cycles on every byte processed.

// 20021025 _M
// Added FilterChainCString to accept a Null Terminated
// String (CString). Except for the input form it operates
// exactly like the FilterChainInput form as modified below.
// This allows WebClay to deliver the message using a buffer
// rather than a file.

// 20021015 _M
// Modified FilterChainInput to eat control characters and
// carriage-return (CR) bytes so that the input stream "appears" always to
// be terminated in the *nix standard \n. Tabs are also passed
// but all other low bytes are eaten.

// 20020721 _M File Created.

// This is the base class - nothing special happens here
// except defining the basic format of a FilterChain object.
// If this object is instantiated, then it will simply return // it's source's data, or a stream of '0's if none has been // defined. #ifndef _MN_FilterChain #define _MN_FilterChain #include #include #include #include #include #include #include using namespace std; // Define parameters for this module. const static int ScanBufferSize = 128; // Define the buffer size. // Define the base class. class FilterChain { private: FilterChain* Source; // Where we get our data. public: class BadSource : public invalid_argument { // Bad Source Exception. public: BadSource(const string& w):invalid_argument(w){} }; class Empty : public underflow_error { // Empty Exception. public: Empty(const string& w):underflow_error(w){} }; virtual unsigned char GetByte() { // Return either 0 if(NULL==Source) return 0; // if we have no source else return Source->GetByte(); // otherwise it's byte. } FilterChain(){Source=NULL;} // Default Constructor no source. // The next constructor throws an error if no source is defined. FilterChain(FilterChain* S) { if(NULL==S) throw BadSource("FilterChain: NULL source not valid"); else Source = S; } virtual ~FilterChain() {} // Stop Warns about no virt dtor }; // FilterChainInput // This version of FilterChain accepts an istream as a source and // gets a single character from it at each GetByte(); class FilterChainInput : public FilterChain { private: istream* SourceIstream; public: // Here we overload the GetByte() function to get a byte // from the source stream. This is a litle bit special because // we're going to start our filtering process. Since we are // filtering text streams for pattern matching systems we will // eat any special control characters we get - including . // This helps us standardize on a *nix model for line ends as // each line end will be \n. It also gets rid of a lot of junk. unsigned char GetByte() { // Get the next byte. char i; // Keep it here. do{ // Loop to eat junk. SourceIstream->get(i); // Read the next byte... 
if(!SourceIstream->good()) // If something went wrong then throw Empty("FilterChain: No more data"); // throw the empty exception. if(i >= ' ') break; // Send all good bytes right away. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it. // Otherwise quietly eat anything } while(true); // less than a space. return i; // Return the latest byte... } // Here we overload the constructor to accept a stream. FilterChainInput(istream* S){ // Build me with a stream. if(NULL==S) throw BadSource("FilterChainInput: Null source not valid" ); // If it's NULL that's bad. if(!S->good()) throw BadSource("FilterChainInput: Bad istream"); // Not good is bad. else SourceIstream = S; // If it's good we keep it. } FilterChainInput() { // If we don't have a source then throw BadSource("FilterChainInput: Source required"); // we're no good. } }; // FilterChainCString // This version sources the data for the chain from a message buffer, or // more precisely a null terminated string. The basic operation is identical // to that of FilterChainInput above except that we're not working with // a filestream as an input. class FilterChainCString : public FilterChain { private: unsigned char* InputBuffer; int BufferIndex; public: // Here we overload GetByte() just like we do in FilterChainInput // except that we're going to get our data from a NULL terminated // string instead of a stream. IN FACT ... the code below was simply // copied from FilterChainInput and modified in place. unsigned char GetByte() { // Get the next byte. unsigned char i; // Keep it here. do{ // Loop to eat junk. i = InputBuffer[BufferIndex++]; // Read the next byte... if(0 == i) // If there's nothing left then throw Empty("FilterChainCString: No more data"); // throw the empty exception. if(i >= ' ') break; // Send all good bytes right away. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it. // Otherwise quietly eat anything } while(true); // less than a space. return i; // Return the latest byte... 
} // Here we overload the constructor to accept a stream. FilterChainCString(unsigned char* S){ // Build me with a char buffer. if(NULL==S) throw BadSource("FilterChainCString: NULL source not valid"); // If it's NULL that's bad. if(0==S[0]) throw BadSource("FilterChainCString: Empty source not valid"); // Empty is bad. else InputBuffer = S; // If it's good we keep it. BufferIndex = 0; // Always start at index 0. } FilterChainCString() { // If we don't have a source then throw BadSource("FilterChainCString: Source required"); // we're no good. } }; // FilterChainCBFR // This version sources the data for the chain from a message buffer, NOT // a null terminated string. The basic operation is identical to FilterChainCString // except that this version requires the length of the buffer and stops when that // number of characters have been read. class FilterChainCBFR : public FilterChain { private: unsigned char* InputBuffer; unsigned int BufferLength; unsigned int BufferIndex; stringstream& PrependedHeaders; bool PrependNotBuffer; public: // Here we overload GetByte() just like we do in FilterChainInput // except that we're going to get our data from a known length char // buffer instead of a stream. IN FACT ... the code below was simply // copied from FilterChainCString and modified in place. unsigned char GetByte() { // Get the next byte. unsigned char i; // Keep it here. if(PrependNotBuffer) { // While in prepend mode: if(BufferIndex < PrependedHeaders.str().length()) { // If there is more to get i = PrependedHeaders.str().at(BufferIndex); // then get it and move ++BufferIndex; // the index. } else { // As soon as we run out PrependNotBuffer = false; // of prepended headers switch BufferIndex = 0; // to the CBFR and reset the index. return GetByte(); // Recurse to get the next byte. } } else { // While in buffer mode: do{ // Loop to eat junk. 
if(BufferLength <= BufferIndex) // If there's nothing left then throw Empty("FilterChainCBFR: No more data"); // throw the empty exception. i = InputBuffer[BufferIndex++]; // Read the next byte... if(i >= ' ') break; // Send all good bytes right away. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it. // Otherwise quietly eat anything } while(true); // less than a space. } return i; // Return the latest byte... } // Here we overload the constructor to accept a stream. FilterChainCBFR(unsigned char* S, int l, stringstream& P) : // Give me a bfr and a stringstream. InputBuffer(S), // Grab the buffer, BufferLength(l), // Grab the buffer length, BufferIndex(0), // Initialize the index to 0, PrependedHeaders(P), // Grab the PrependedHeaders reference. PrependNotBuffer(true) { // Do PrependedHeaders first. if(NULL==S) throw BadSource("FilterChainCBFR: NULL source not valid"); // If it's NULL that's bad. if(0==l && 0==P.str().length()) throw BadSource("FilterChainCBFR: Empty source not valid"); // Empty is bad. } }; // FilterChainBase64 // This version decodes base64 content in email messages. It begins // to decode this as soon as it sees the following message and two // blank lines indicating the coding has started. // // Content-Transfer-Encoding: base64 // // Once it sees a bad character or what appears to be the start of // a new MIME segment, the filter turns off and passes through it's // source data. // The startup string for this filter is below. In this case we keep the // part of the string to ensure we will be looking at the start // of a line when we match. const static unsigned char Base64Start[] = "\nContent-Transfer-Encoding: base64"; // The following table makes conversion fast because it's all lookups. The // special value XX64 is used everywhere a bad byte is found in the table. const static unsigned char XX64 = 0xFF; // Note the special case '=' is used for pad. It is given the value 0x00. // The input to this table is the incoming byte. 
The output is either XX64 // or a valid base64 numerical value. const static unsigned char Base64Table[256] = { // 0 1 2 3 4 5 6 7 8 9 A B C D E F XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // 0 XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // 1 XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,0x3E,XX64,XX64,XX64,0x3F, // 2 0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,XX64,XX64,XX64,0x00,XX64,XX64, // 3 XX64,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E, // 4 0x0F,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,XX64,XX64,XX64,XX64,XX64, // 5 XX64,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28, // 6 0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,0x30,0x31,0x32,0x33,XX64,XX64,XX64,XX64,XX64, // 7 XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // 8 XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // 9 XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // A XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // B XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // C XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // D XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64, // E XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64 // F }; // The following constants are used to find segment positions when converting from // 4 six bit values to 3 octets. const static unsigned char base64_seg0_shift = 18; const static unsigned char base64_seg1_shift = 12; const static unsigned char base64_seg2_shift = 6; const static unsigned char base64_seg3_shift = 0; class FilterChainBase64 : public FilterChain { private: unsigned char x,y; // We need a few holding bins. 
unsigned int Workspace; // Numerical workspace for conversion. enum FilterState { // Operating State Codes. SCANNING, // One-in = One-out, looking for startup. DEQUEING, // Delivering buffered data. DECODING // Delivering filtered data. } State; unsigned int ScanIx; // Scanning Index. unsigned int DequeIx; // Dequeing Index. unsigned char Buffer; // Define a buffer. bool ValidByte(unsigned char y); // True if y can be decoded. public: unsigned char GetByte(); // Overload the main fn(). FilterChainBase64(FilterChain* S) // Sourced constructor... :FilterChain(S){ // Call the base constructor. State = SCANNING; // Set filter inactive. ScanIx=DequeIx=0; // Reset our indexes. } // We're all ready to start. FilterChainBase64() { // Don't allow any throw BadSource("FilterChainBase64: Source required"); // null constructors. } }; // FilterChainQuotedPrintable // This version decodes quoted-printable content in email messages. // // For simplicity this one is always on. That is, whenever it sees a // convertable quoted printable byte it will exchange it for the byte // that is represented. This is only intended for operation preceeding the // spam filter engine so it is safe to make these conversions. class FilterChainQuotedPrintable : public FilterChain { private: long int Workspace; // Plain Text Workspace. enum FilterState { // Operating State Codes SCANNING, // One-in = One-out - looking for start-up. DEQUEING, // Delivering buffered data. DECODING // Delivering filtered data. } State; int BufferLength; // How full is the buffer. int BufferIndex; // What byte are we on? unsigned char Buffer[ScanBufferSize]; // Define the buffer. bool isHexDigit(unsigned char i); // true if i is a hex digit byte. int convertHexDigit(unsigned char i); // returns integer value of hex digit i. public: unsigned char GetByte(); // Overload the main fn(). FilterChainQuotedPrintable(FilterChain* S) // Sourced constructor... :FilterChain(S){ // Call the base constructor. 
State = SCANNING; // Set to the initial state. BufferIndex = 0; // Initial buffer index. BufferLength = 0; // Initial buffer length. Workspace = 0; // Clear the workspace. } FilterChainQuotedPrintable() { // Don't allow any throw BadSource("FilterChainQuotedPrintable: Source required"); // null constructors. } }; // FilterChainDefunker // This module stores a copy of the stream containing HTML and then emits it // at the end of the stream with all of the html elements removed and/or decoded // to eliminate html based obfuscation. class FilterChainDefunker; static const int DefunkerSize = 32768; // Store size. static const int DefunkerQueueSize = 24; // Size of defunker queue. static const char* DefunkerPreamble = " ----[DEFUNKER]---- "; // Patterns to match static const char* patMatchBR = "
"; static const char* patMatchP = "

"; static const char* patNBSP = " "; static const char* patAMP = "&"; static const char* patAPOS = "'"; static const char* patLT = "<"; static const char* patGT = ">"; static const char* patQUOT = """; class FilterChainDefunker : public FilterChain { // Class definition. private: unsigned char StoreBuffer[DefunkerSize]; int InputPosition; int OutputPosition; // Nodes in the state change model are represented by functions. // These modes represent the state prior to getting the Empty exception. // During this mode, the Defunker simply stores a portion of the message // to be scanned later. unsigned char LastRawByte; // Last Raw Byte (for SkipHeaders); unsigned char SkipHeaders(); // Skips the headers before Store(); unsigned char Store(); // Stores the message content for later. // Here is a handy Queue mechanism for recovering failed patterns. int QueueLength; // Queue Length (write position). int QueuePosition; // Queue Read Position. unsigned char Qbfr[DefunkerQueueSize]; // Queue Buffer. void ClearQueue() { // Clear the queue. memset(Qbfr,0,sizeof(Qbfr)); // Reset the buffer. QueueLength = 0; // Zero the length. QueuePosition = 0; // Zero the position. } unsigned char DeQueue() { // Empty the queue then back to DefunkRoot. if(QueuePosition >= QueueLength) { // If the queue is empty then ClearQueue(); // clear the queue, Internal = &FilterChainDefunker::DefunkRoot; // go back to DefunkRoot mode, return GetInternal(); // and return the next byte. } // If the queue is not empty then return Qbfr[QueuePosition++]; // return the next byte from the queue. } void EnQueue(unsigned char x) { // Add a byte to the queue. if(QueueLength unsigned char MatchP(); // Matching

unsigned char MatchNBSP(); // Matching &nbps; unsigned char SwitchAMPAPOS(); // Looking for AMP or APOS. unsigned char MatchAMP(); // Matching & unsigned char MatchAPOS(); // Matching ' unsigned char MatchLT(); // Matching < unsigned char MatchGT(); // Matching > unsigned char MatchQUOT(); // Matching " unsigned char EatTag(); // Eating an unknown tag. unsigned char DecodeNum(); // Decoding &#...number...; // Part of defunking is to convert all runs of whitespace into a single space. // It also doubles as the master output function once we're out of Store() mode. unsigned char SpaceConvChart[256]; // Space conversion chart. unsigned char LastReadOut; // Last ReadOut byte (for deduping spaces). unsigned char ReadOut(); // Read out the store through the filter. unsigned char LastGetStore; // Last GetStore byte (for EatTag). unsigned char GetStore(); // Read a byte from the store. // Here is a handy pattern match function for eliminating some tags. bool MatchTagPattern(const char* pattern) { // Matches pattern. True if matched. int pos = 2; // Now on the third byte (index 2). while(pattern[pos]){ // While we have more bytes to match unsigned char x = GetStore(); // grab the next byte. // Special case - HTML tag with a space as in

if(x==' ' && pattern[pos]=='>') { // If we have a tag with parameters. pos++; // Move pos forward to it's null. while(GetStore()!='>')continue; // Eat up to the > and then break; // we are done. } // In the normal case follow the pattern. if(tolower(x)!=pattern[pos]) break; // If we fell off then stop. pos++; // If we didn't break move ahead. } // At this point we are either at the null in our pattern or we did not match. if(pattern[pos]) { return false; } // If we're not at the end then no match. return true; // Otherwise we do have a match :-) } // These are the function pointers that map the current state of this object. unsigned char (FilterChainDefunker::*Master)(); // Master function for GetByte() unsigned char (FilterChainDefunker::*Internal)(); // Internal function for GetByte() public: unsigned char GetByte() { // Overload the main fn(). return (*this.*Master)(); // Call the master function. } unsigned char GetInternal() { // Internal state machine get. return (*this.*Internal)(); // Call the internal function. } FilterChainDefunker(FilterChain* S) // Sourced constructor... :FilterChain(S), // Call the base constructor. InputPosition(0), // Reset both position pointers. OutputPosition(0), LastRawByte(0), LastReadOut(0), LastGetStore(0), Master(&FilterChainDefunker::SkipHeaders), // Set the initial external and Internal(&FilterChainDefunker::Preamble) { // internal states. ClearQueue(); // Clear the queue; memset(StoreBuffer,0,sizeof(StoreBuffer)); // Clear the store buffer. for(int i=0;i<256;i++) SpaceConvChart[i]=i; // Initialize the chart. SpaceConvChart[(int)'\r']=' '; // Convert to space. SpaceConvChart[(int)'\n']=' '; // Convert to space. SpaceConvChart[(int)'\t']=' '; // Convert Tab to space. } FilterChainDefunker() { // Don't allow any throw BadSource("FilterChainDefunker: Source required"); // null constructors. } }; // FilterChainUrlDecode // This module removes any unnecessary URL encoding within an tag. 
The // cleaned up version (if different) is emitted immediately after the original // tag so that both versions can be interpreted by the pattern scanner. // This is designed to eliminate common obfuscation techniques. const int UrlDecodeBfrSize = 256; // Decode Buffer Size. class FilterChainUrlDecode : public FilterChain { private: unsigned char DecodeBfr[UrlDecodeBfrSize]; // Decoded anchor buffer. unsigned int DecodeLength; // Decoded anchor length. unsigned int DecodePosition; // Read (Inject) Position. bool DecodeFlag; // True if the URL was decoded. void Clear() { // Function to clear the bfr. memset(DecodeBfr,0,sizeof(DecodeBfr)); // Null it out and set DecodeLength = 0; // the length to zero. DecodePosition = 0; // Reset the Read position. DecodeFlag = false; // Reset the Decode Flag. } void AddToBfr(unsigned char c) { // Safely add to our buffer. if(DecodeLength < sizeof(DecodeBfr)-1) // If we have more room then DecodeBfr[DecodeLength++] = c; // write the incoming byte. } unsigned char (FilterChainUrlDecode::*Internal)(); // Internal State Fn bool isHexDigit(unsigned char i); // Is i a hex digit? int convertHexDigit(unsigned char i); // Convert a single hex digit. unsigned char convertHexByte(unsigned char* x); // Convert a hex byte. // Here are the states of the UrlDecode module... unsigned char Bypass(); // Bypass - waiting for '<' unsigned char Tag(); // Looks for an 'a' or 'i' after '<' unsigned char Img1(); // Looks for 'm' in