12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768 |
- // FilterChain.hpp
- //
- // (C) 2002-2009 MicroNeil Research Corporation
- //
// This is the base class header for FilterChain objects.
// FilterChain objects can be chained together to filter
// a byte stream. Each object produces a single character
// per call. It will also call its source object for the
// next character as required.
-
- // History...
-
- // 20060822 _M
- // Adding FilterChainHeaderAnalysis to identify missing headers and header
- // anomalies, and to extract and test IP data.
-
- // 20060127 _M
- // Added FilterChainCBFG to accept a buffer of a specific
- // length.
-
- // 20041116 _M Added UrlDecode module. The module will repeat a decoded version of
- // any anchor tag that it sees which contains decodable %xx bytes. Other anchor
- // tags are not repeated.
-
- // 20041116 _M Upgrades to the Defunker module. The module now decodes any HTML
- // encoded bytes that could have been normal ascii.
-
- // 20041114 _M Completed basic defunker engine which strips out all HTML and some
- // basic encoding.
-
- // 20041113 _M Began heavy upgrades to this module to improve performance and
- // provide additional obfuscation removal. This modification will include a move
- // from the use of switch(State) mechanisms to the use of function pointers. This
- // should save a few cycles on every byte processed.
-
- // 20021025 _M
- // Added FilterChainCString to accept a Null Terminated
- // String (CString). Except for the input form it operates
- // exactly like the FilterChainInput form as modified below.
- // This allows WebClay to deliver the message using a buffer
- // rather than a file.
-
- // 20021015 _M
- // Modified FilterChainInput to eat control characters and
- // <CR> bytes so that the input stream "appears" always to
- // be terminated in the *nix standard \n. Tabs are also passed
- // but all other low bytes are eaten.
-
- // 20020721 _M File Created.
-
// This is the base class - nothing special happens here
// except defining the basic format of a FilterChain object.
// If this object is instantiated, then it will simply return
// its source's data, or a stream of 0 bytes if no source has
// been defined.
-
- #ifndef _MN_FilterChain
- #define _MN_FilterChain
-
- #include <stdexcept>
- #include <iostream>
- #include <sstream>
- #include <string>
- #include <cstring>
- #include <cstdlib>
- #include <cctype>
-
-
- using namespace std;
-
-
- // Define parameters for this module.
-
static const int ScanBufferSize = 128;                          // Capacity, in bytes, of a filter's scan buffer.
-
- // Define the base class.
-
// FilterChain
// Base class for every link in a filter chain. A link hands out one byte per
// GetByte() call, pulling from an optional upstream link (its Source) when it
// needs more data. The base implementation is a plain pass-through: it returns
// the upstream byte, or 0 when no upstream link was supplied.

class FilterChain {

  private:

    FilterChain* Source;                                        // Upstream link (not owned here); NULL if none.

  public:

    // BadSource is thrown by constructors that are given an unusable source.

    class BadSource : public std::invalid_argument {
      public: BadSource(const std::string& w) : std::invalid_argument(w) {}
    };

    // Empty is thrown by the GetByte() implementations in this file when
    // their data has run out.

    class Empty : public std::underflow_error {
      public: Empty(const std::string& w) : std::underflow_error(w) {}
    };

    // Links are handled through FilterChain* and GetByte() is virtual, so the
    // destructor must be virtual too: deleting a derived link through a base
    // pointer is undefined behaviour otherwise.

    virtual ~FilterChain() {}

    // Returns the next byte from the upstream link, or 0 if there is none.

    virtual unsigned char GetByte() {
        return (NULL == Source) ? 0 : Source->GetByte();
    }

    // Builds a link with no upstream source.

    FilterChain() : Source(NULL) {}

    // Builds a link that reads from S. Throws BadSource if S is NULL.

    FilterChain(FilterChain* S) : Source(S) {
        if(NULL == S) throw BadSource("FilterChain: NULL source not valid");
    }
};
-
- // FilterChainInput
- // This version of FilterChain accepts an istream as a source and
- // gets a single character from it at each GetByte();
-
- class FilterChainInput : public FilterChain {
-
- private:
-
- istream* SourceIstream;
-
- public:
-
- // Here we overload the GetByte() function to get a byte
- // from the source stream. This is a litle bit special because
- // we're going to start our filtering process. Since we are
- // filtering text streams for pattern matching systems we will
- // eat any special control characters we get - including <CR>.
- // This helps us standardize on a *nix model for line ends as
- // each line end will be \n. It also gets rid of a lot of junk.
-
- unsigned char GetByte() { // Get the next byte.
- char i; // Keep it here.
-
- do{ // Loop to eat junk.
-
- SourceIstream->get(i); // Read the next byte...
- if(!SourceIstream->good()) // If something went wrong then
- throw Empty("FilterChain: No more data"); // throw the empty exception.
-
- if(i >= ' ') break; // Send all good bytes right away.
- if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
- // Otherwise quietly eat anything
- } while(true); // less than a space.
-
- return i; // Return the latest byte...
- }
-
- // Here we overload the constructor to accept a stream.
-
- FilterChainInput(istream* S){ // Build me with a stream.
- if(NULL==S) throw BadSource("FilterChainInput: Null source not valid" ); // If it's NULL that's bad.
- if(!S->good()) throw BadSource("FilterChainInput: Bad istream"); // Not good is bad.
- else SourceIstream = S; // If it's good we keep it.
- }
-
- FilterChainInput() { // If we don't have a source then
- throw BadSource("FilterChainInput: Source required"); // we're no good.
- }
- };
-
- // FilterChainCString
- // This version sources the data for the chain from a message buffer, or
- // more precisely a null terminated string. The basic operation is identical
- // to that of FilterChainInput above except that we're not working with
- // a filestream as an input.
-
- class FilterChainCString : public FilterChain {
-
- private:
-
- unsigned char* InputBuffer;
- int BufferIndex;
-
- public:
-
- // Here we overload GetByte() just like we do in FilterChainInput
- // except that we're going to get our data from a NULL terminated
- // string instead of a stream. IN FACT ... the code below was simply
- // copied from FilterChainInput and modified in place.
-
- unsigned char GetByte() { // Get the next byte.
- unsigned char i; // Keep it here.
-
- do{ // Loop to eat junk.
-
- i = InputBuffer[BufferIndex++]; // Read the next byte...
- if(0 == i) // If there's nothing left then
- throw Empty("FilterChainCString: No more data"); // throw the empty exception.
-
- if(i >= ' ') break; // Send all good bytes right away.
- if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
- // Otherwise quietly eat anything
- } while(true); // less than a space.
-
- return i; // Return the latest byte...
- }
-
- // Here we overload the constructor to accept a stream.
-
- FilterChainCString(unsigned char* S){ // Build me with a char buffer.
- if(NULL==S) throw BadSource("FilterChainCString: NULL source not valid"); // If it's NULL that's bad.
- if(0==S[0]) throw BadSource("FilterChainCString: Empty source not valid"); // Empty is bad.
- else InputBuffer = S; // If it's good we keep it.
- BufferIndex = 0; // Always start at index 0.
- }
-
- FilterChainCString() { // If we don't have a source then
- throw BadSource("FilterChainCString: Source required"); // we're no good.
- }
- };
-
- // FilterChainCBFR
- // This version sources the data for the chain from a message buffer, NOT
- // a null terminated string. The basic operation is identical to FilterChainCString
- // except that this version requires the length of the buffer and stops when that
- // number of characters have been read.
-
- class FilterChainCBFR : public FilterChain {
-
- private:
-
- unsigned char* InputBuffer;
- int BufferIndex;
- int BufferLength;
-
- stringstream& PrependedHeaders;
-
- bool PrependNotBuffer;
-
- public:
-
- // Here we overload GetByte() just like we do in FilterChainInput
- // except that we're going to get our data from a known length char
- // buffer instead of a stream. IN FACT ... the code below was simply
- // copied from FilterChainCString and modified in place.
-
- unsigned char GetByte() { // Get the next byte.
- unsigned char i; // Keep it here.
-
- if(PrependNotBuffer) { // While in prepend mode:
-
- if(BufferIndex < PrependedHeaders.str().length()) { // If there is more to get
- i = PrependedHeaders.str().at(BufferIndex); // then get it and move
- ++BufferIndex; // the index.
- } else { // As soon as we run out
- PrependNotBuffer = false; // of prepended headers switch
- BufferIndex = 0; // to the CBFR and reset the index.
- return GetByte(); // Recurse to get the next byte.
- }
-
- } else { // While in buffer mode:
-
- do{ // Loop to eat junk.
- if(BufferLength <= BufferIndex) // If there's nothing left then
- throw Empty("FilterChainCBFR: No more data"); // throw the empty exception.
-
- i = InputBuffer[BufferIndex++]; // Read the next byte...
-
- if(i >= ' ') break; // Send all good bytes right away.
- if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
- // Otherwise quietly eat anything
- } while(true); // less than a space.
- }
-
- return i; // Return the latest byte...
- }
-
- // Here we overload the constructor to accept a stream.
-
- FilterChainCBFR(unsigned char* S, int l, stringstream& P) : // Give me a bfr and a stringstream.
- InputBuffer(S), // Grab the buffer,
- BufferLength(l), // Grab the buffer length,
- BufferIndex(0), // Initialize the index to 0,
- PrependedHeaders(P), // Grab the PrependedHeaders reference.
- PrependNotBuffer(true) { // Do PrependedHeaders first.
-
- if(NULL==S) throw BadSource("FilterChainCBFR: NULL source not valid"); // If it's NULL that's bad.
- if(0==l && 0==P.str().length())
- throw BadSource("FilterChainCBFR: Empty source not valid"); // Empty is bad.
- }
-
- };
-
- // FilterChainBase64
- // This version decodes base64 content in email messages. It begins
- // to decode this as soon as it sees the following message and two
- // blank lines indicating the coding has started.
- //
- // Content-Transfer-Encoding: base64
- //
// Once it sees a bad character or what appears to be the start of
// a new MIME segment, the filter turns off and passes through its
// source data.
-
- // The startup string for this filter is below. In this case we keep the
- // <LF> part of the string to ensure we will be looking at the start
- // of a line when we match.
-
static const unsigned char Base64Start[] = "\nContent-Transfer-Encoding: base64";

// Decoding is done by table lookup. XX64 marks every input byte that is not
// part of the base64 alphabet.

static const unsigned char XX64 = 0xFF;

// Base64Table maps an incoming byte to its six bit value, or to XX64 if the
// byte cannot be decoded. The pad character '=' is a special case and maps
// to 0x00.

static const unsigned char Base64Table[256] = {

    //  x0   x1   x2   x3   x4   x5   x6   x7   x8   x9   xA   xB   xC   xD   xE   xF

    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // 0x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // 1x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,0x3E,XX64,XX64,XX64,0x3F,    // 2x  '+' '/'
    0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,XX64,XX64,XX64,0x00,XX64,XX64,    // 3x  '0'-'9' '='
    XX64,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,    // 4x  'A'-'O'
    0x0F,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,XX64,XX64,XX64,XX64,XX64,    // 5x  'P'-'Z'
    XX64,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,    // 6x  'a'-'o'
    0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,0x30,0x31,0x32,0x33,XX64,XX64,XX64,XX64,XX64,    // 7x  'p'-'z'
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // 8x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // 9x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // Ax
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // Bx
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // Cx
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // Dx
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,    // Ex
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64     // Fx
};

// Bit positions of the four six bit segments within the 24 bit group that is
// converted into 3 octets.

static const unsigned char base64_seg0_shift = 18;
static const unsigned char base64_seg1_shift = 12;
static const unsigned char base64_seg2_shift = 6;
static const unsigned char base64_seg3_shift = 0;
-
- class FilterChainBase64 : public FilterChain {
-
- private:
-
- unsigned char x,y; // We need a few holding bins.
- unsigned int Workspace; // Numerical workspace for conversion.
-
- enum FilterState { // Operating State Codes.
- SCANNING, // One-in = One-out, looking for startup.
- DEQUEING, // Delivering buffered data.
- DECODING // Delivering filtered data.
- } State;
-
- int ScanIx; // Scanning Index.
- int DequeIx; // Dequeing Index.
- unsigned char Buffer; // Define a buffer.
-
- bool ValidByte(unsigned char y); // True if y can be decoded.
-
- public:
-
- unsigned char GetByte(); // Overload the main fn().
-
- FilterChainBase64(FilterChain* S) // Sourced constructor...
- :FilterChain(S){ // Call the base constructor.
- State = SCANNING; // Set filter inactive.
- ScanIx=DequeIx=0; // Reset our indexes.
- } // We're all ready to start.
-
- FilterChainBase64() { // Don't allow any
- throw BadSource("FilterChainBase64: Source required"); // null constructors.
- }
-
- };
-
// FilterChainQuotedPrintable
// This version decodes quoted-printable content in email messages.
//
// For simplicity this one is always on. That is, whenever it sees a
// convertible quoted-printable byte it will exchange it for the byte
// that is represented. This is only intended for operation preceding the
// spam filter engine so it is safe to make these conversions.
-
- class FilterChainQuotedPrintable : public FilterChain {
-
- private:
-
- long int Workspace; // Plain Text Workspace.
- enum FilterState { // Operating State Codes
- SCANNING, // One-in = One-out - looking for start-up.
- DEQUEING, // Delivering buffered data.
- DECODING // Delivering filtered data.
- } State;
-
- int BufferLength; // How full is the buffer.
- int BufferIndex; // What byte are we on?
- unsigned char Buffer[ScanBufferSize]; // Define the buffer.
-
- bool isHexDigit(unsigned char i); // true if i is a hex digit byte.
- int convertHexDigit(unsigned char i); // returns integer value of hex digit i.
-
- public:
-
- unsigned char GetByte(); // Overload the main fn().
-
- FilterChainQuotedPrintable(FilterChain* S) // Sourced constructor...
- :FilterChain(S){ // Call the base constructor.
- State = SCANNING; // Set to the initial state.
- BufferIndex = 0; // Initial buffer index.
- BufferLength = 0; // Initial buffer length.
- Workspace = 0; // Clear the workspace.
- }
-
- FilterChainQuotedPrintable() { // Don't allow any
- throw BadSource("FilterChainQuotedPrintable: Source required"); // null constructors.
- }
-
- };
-
-
- // FilterChainDefunker
- // This module stores a copy of the stream containing HTML and then emits it
- // at the end of the stream with all of the html elements removed and/or decoded
- // to eliminate html based obfuscation.
-
class FilterChainDefunker;

static const int DefunkerSize = 32768;                          // Store size.
static const int DefunkerQueueSize = 24;                        // Size of defunker queue.

static const char* DefunkerPreamble = " ----[DEFUNKER]---- ";

// Patterns to match.
//
// The entity patterns are spelled out in full ("&nbsp;", "&amp;", ...). They
// must never be written as the characters they stand for: the tag matcher in
// FilterChainDefunker resumes comparing at the third byte (index 2) of a
// pattern, so each pattern has to carry the complete tag or entity text.

static const char* patMatchBR = "<br>";
static const char* patMatchP = "<p>";
static const char* patNBSP = "&nbsp;";
static const char* patAMP = "&amp;";
static const char* patAPOS = "&apos;";
static const char* patLT = "&lt;";
static const char* patGT = "&gt;";
static const char* patQUOT = "&quot;";
-
- class FilterChainDefunker : public FilterChain { // Class definition.
-
- private:
-
- unsigned char StoreBuffer[DefunkerSize];
- int InputPosition;
- int OutputPosition;
-
- // Nodes in the state change model are represented by functions.
- // These modes represent the state prior to getting the Empty exception.
- // During this mode, the Defunker simply stores a portion of the message
- // to be scanned later.
-
- unsigned char LastRawByte; // Last Raw Byte (for SkipHeaders);
- unsigned char SkipHeaders(); // Skips the headers before Store();
- unsigned char Store(); // Stores the message content for later.
-
- // Here is a handy Queue mechanism for recovering failed patterns.
-
- int QueueLength; // Queue Length (write position).
- int QueuePosition; // Queue Read Position.
- unsigned char Qbfr[DefunkerQueueSize]; // Queue Buffer.
-
- void ClearQueue() { // Clear the queue.
- memset(Qbfr,0,sizeof(Qbfr)); // Reset the buffer.
- QueueLength = 0; // Zero the length.
- QueuePosition = 0; // Zero the position.
- }
-
- unsigned char DeQueue() { // Empty the queue then back to DefunkRoot.
- if(QueuePosition >= QueueLength) { // If the queue is empty then
- ClearQueue(); // clear the queue,
- Internal = &FilterChainDefunker::DefunkRoot; // go back to DefunkRoot mode,
- return GetInternal(); // and return the next byte.
- } // If the queue is not empty then
- return Qbfr[QueuePosition++]; // return the next byte from the queue.
- }
-
- void EnQueue(unsigned char x) { // Add a byte to the queue.
- if(QueueLength<DefunkerQueueSize) // If we are safely within the buffer
- Qbfr[QueueLength++] = x; // then add this byte to the queue.
- }
-
- // These modes represent the Defunker pulling data out of it's
- // stored copy so that it can be filtered and delivered to the scanner.
- // These modes get turned on once the Empty exception is read from
- // the underlying source.
-
- unsigned char Preamble(); // Preamble - separates Defunked text.
- unsigned char DefunkRoot(); // Root in Defunk mode.
- unsigned char OpenTag(); // Open tag detected.
- unsigned char OpenAmp(); // Open & tag.
- unsigned char MatchBR(); // Matching <br>
- unsigned char MatchP(); // Matching <p>
- unsigned char MatchNBSP(); // Matching &nbps;
- unsigned char SwitchAMPAPOS(); // Looking for AMP or APOS.
- unsigned char MatchAMP(); // Matching &
- unsigned char MatchAPOS(); // Matching '
- unsigned char MatchLT(); // Matching <
- unsigned char MatchGT(); // Matching >
- unsigned char MatchQUOT(); // Matching "
- unsigned char EatTag(); // Eating an unknown tag.
- unsigned char DecodeNum(); // Decoding &#...number...;
-
- // Part of defunking is to convert all runs of whitespace into a single space.
- // It also doubles as the master output function once we're out of Store() mode.
-
- unsigned char SpaceConvChart[256]; // Space conversion chart.
- unsigned char LastReadOut; // Last ReadOut byte (for deduping spaces).
- unsigned char ReadOut(); // Read out the store through the filter.
-
- unsigned char LastGetStore; // Last GetStore byte (for EatTag).
- unsigned char GetStore(); // Read a byte from the store.
-
- // Here is a handy pattern match function for eliminating some tags.
-
- bool MatchTagPattern(const char* pattern) { // Matches pattern. True if matched.
- int pos = 2; // Now on the third byte (index 2).
- while(pattern[pos]){ // While we have more bytes to match
- unsigned char x = GetStore(); // grab the next byte.
-
- // Special case - HTML tag with a space as in <p stuff>
-
- if(x==' ' && pattern[pos]=='>') { // If we have a tag with parameters.
- pos++; // Move pos forward to it's null.
- while(GetStore()!='>')continue; // Eat up to the > and then
- break; // we are done.
- }
-
- // In the normal case follow the pattern.
-
- if(tolower(x)!=pattern[pos]) break; // If we fell off then stop.
- pos++; // If we didn't break move ahead.
- }
-
- // At this point we are either at the null in our pattern or we did not match.
-
- if(pattern[pos]) { return false; } // If we're not at the end then no match.
-
- return true; // Otherwise we do have a match :-)
- }
-
- // These are the function pointers that map the current state of this object.
-
- unsigned char (FilterChainDefunker::*Master)(); // Master function for GetByte()
- unsigned char (FilterChainDefunker::*Internal)(); // Internal function for GetByte()
-
- public:
-
- unsigned char GetByte() { // Overload the main fn().
- return (*this.*Master)(); // Call the master function.
- }
-
- unsigned char GetInternal() { // Internal state machine get.
- return (*this.*Internal)(); // Call the internal function.
- }
-
- FilterChainDefunker(FilterChain* S) // Sourced constructor...
- :FilterChain(S), // Call the base constructor.
- Master(&FilterChainDefunker::SkipHeaders), // Set the initial external and
- Internal(&FilterChainDefunker::Preamble), // internal states.
- InputPosition(0), // Reset both position pointers.
- OutputPosition(0),
- LastReadOut(0),
- LastGetStore(0),
- LastRawByte(0) {
-
- ClearQueue(); // Clear the queue;
-
- memset(StoreBuffer,0,sizeof(StoreBuffer)); // Clear the store buffer.
-
- for(int i=0;i<256;i++) SpaceConvChart[i]=i; // Initialize the chart.
- SpaceConvChart[(int)'\r']=' '; // Convert <CR> to space.
- SpaceConvChart[(int)'\n']=' '; // Convert <LF> to space.
- SpaceConvChart[(int)'\t']=' '; // Convert Tab to space.
- }
-
- FilterChainDefunker() { // Don't allow any
- throw BadSource("FilterChainDefunker: Source required"); // null constructors.
- }
-
- };
-
- // FilterChainUrlDecode
- // This module removes any unnecessary URL encoding within an <a...> tag. The
- // cleaned up version (if different) is emitted immediately after the original
- // <a...> tag so that both versions can be interpreted by the pattern scanner.
- // This is designed to eliminate common obfuscation techniques.
-
- const int UrlDecodeBfrSize = 256; // Decode Buffer Size.
-
- class FilterChainUrlDecode : public FilterChain {
-
- private:
-
- unsigned char DecodeBfr[UrlDecodeBfrSize]; // Decoded anchor buffer.
- int DecodeLength; // Decoded anchor length.
- int DecodePosition; // Read (Inject) Position.
- bool DecodeFlag; // True if the URL was decoded.
-
- void Clear() { // Function to clear the bfr.
- memset(DecodeBfr,0,sizeof(DecodeBfr)); // Null it out and set
- DecodeLength = 0; // the length to zero.
- DecodePosition = 0; // Reset the Read position.
- DecodeFlag = false; // Reset the Decode Flag.
- }
-
- void AddToBfr(unsigned char c) { // Safely add to our buffer.
- if(DecodeLength < sizeof(DecodeBfr)-1) // If we have more room then
- DecodeBfr[DecodeLength++] = c; // write the incoming byte.
- }
-
- unsigned char (FilterChainUrlDecode::*Internal)(); // Internal State Fn
-
- bool isHexDigit(unsigned char i); // Is i a hex digit?
- int convertHexDigit(unsigned char i); // Convert a single hex digit.
- unsigned char convertHexByte(unsigned char* x); // Convert a hex byte.
-
- // Here are the states of the UrlDecode module...
-
- unsigned char Bypass(); // Bypass - waiting for '<'
- unsigned char Tag(); // Looks for an 'a' or 'i' after '<'
- unsigned char Img1(); // Looks for 'm' in <img
- unsigned char Img2(); // Looks for 'g' in <img
- unsigned char Root(); // Root state of the decode FSM.
- unsigned char GetD1(); // Decoding step one.
- unsigned char GetD2(); // Decoding step two.
- unsigned char Inject(); // Injects the bfr into the stream.
-
- public:
-
- unsigned char GetByte() { // Overload the main fn().
- return (*this.*Internal)(); // Call the Internal function.
- }
-
- FilterChainUrlDecode(FilterChain* S) // Sourced constructor...
- :FilterChain(S), // Call the base constructor.
- Internal(&FilterChainUrlDecode::Bypass) { // Set ByPass mode.
- Clear(); // Clear the system.
- }
-
- FilterChainUrlDecode() { // Don't allow any
- throw BadSource("FilterChainUrlDecode: Source required"); // null constructors.
- }
-
- };
-
// FilterChainHeaderAnalysis (and friends)
// Performs header anomaly analysis and IP extraction and analysis.
// IP analysis is performed via a provided class that implements the IPTester
// interface. An IP is provided to the IPTester as a [#.#.#.#] string. The
// IPTester may respond with information to be emitted into the headers for
// the pattern matching engine based on those results --- or not ;-)
-
// FilterChainIPTester
// Interface implemented by whoever supplies the IP test to
// FilterChainHeaderAnalysis.

class FilterChainIPTester {
  public:

    // Implementations are used through this interface, so give it a virtual
    // destructor: destroying a tester through a FilterChainIPTester pointer
    // is undefined behaviour without one.

    virtual ~FilterChainIPTester() {}

    // Tests the IP held in input. Anything written to output is meant to be
    // emitted into the data stream; leave it blank to emit nothing.
    // Returns a reference to a string (the output string, per the notes that
    // accompany this interface).

    virtual std::string& test(std::string& input, std::string& output) = 0;
};
-
- // The supplied test() function accepts the input string and returns the
- // output string. If desired, the output string can be modified to include
- // data from the tests that will be emitted into the data stream for the
- // pattern analysis engine to see. Otherwise, the output string should
- // remain blank. The test() function _should_ be thread safe -- that is why
- // we pass it both input and output ;-)
- //
- // The provided tester may have any side-effects that are desired.
-
- class FilterChainHeaderAnalysis : public FilterChain {
-
- private:
-
- unsigned char (FilterChainHeaderAnalysis::*Mode)(); // Internal State Fn Pointer (What Mode)
- FilterChainIPTester& IPTester; // This is the IP tester we use.
- string IPToTest; // String to capture IPs for testing.
- string IPTestResult; // String to receive IPtest results.
-
- // Header analysis output state...
-
- string EndOfHeaderResults; // String to capture EndOfHeaderResults.
-
- // OutputIndex and OutputLength are used to inject string data.
- // These are used to inject IPTestResult data and Header Analysis data.
-
- char* OutputBuffer; // Pointer to output injection string.
- int OutputIndex; // End of header output results index.
- void SetOutputBuffer(string& s); // Setup the OutputBuffer.
- unsigned char doInjectIPTestResult(); // Inject OutputBuffer and go to doSeekNL.
- unsigned char doInjectAnalysis(); // Inject OutputBuffer and go to doOff.
-
- // Header seek pattern state...
- // These tools work to follow patterns for header tags.
- // SetFollowPattern resets the engine and establishes the pattern to follow.
- // FollowPattern checks c against the next byte in the pattern.
- // -1 = The pattern failed.
- // 1 = The pattern was followed.
- // 0 = The pattern is complete.
-
- const char* MatchPattern; // Current pattern to match.
- int MatchIndex; // Pattern match following index.
- void SetFollowPattern(const char* p) { MatchPattern = p; MatchIndex = 0; } // Set the pattern to follow.
- int FollowPattern(char c); // Follow the pattern.
-
- //// Internal modes for this module...
-
- unsigned char doSeekNL(); // Looking for a new line.
- unsigned char doSeekDispatch(); // Looking at the first char after NL.
- unsigned char doReceived(); // Identifying a Received: header.
- unsigned char doFindIP(); // Seeking the [IP] in a Received header.
- unsigned char doTestIP(); // Gets and tests the [IP].
- unsigned char doFrom(); // Identifying a From: header.
- unsigned char doTo(); // Identifying a To: header.
- unsigned char doCC(); // Identifying a CC: header.
- unsigned char doMessageID(); // Identifying a MessageID header.
- unsigned char doDate(); // Identifying a Date: header.
- unsigned char doSubject(); // Identifying a Subject: header.
- unsigned char doEndOfHeaders(); // IdentifyEndOfHeaders & Emit Results.
-
- unsigned char doOff() { return FilterChain::GetByte(); } // Bypass mode.
-
- bool FoundFrom; // True if From: was found.
- bool FoundTo; // True if To: was found.
- bool FoundCC; // True if CC: was found.
- bool FoundMessageID; // True if Message-ID: was found.
- bool FoundDate; // True if Date: was found.
- bool FoundSubject; // True if Subject: was found.
- bool FoundHighBitCharacters; // True if high bit characters were found.
-
- unsigned char GetCheckedByte() { // Internal GetByte & check for high bits.
- unsigned char x = FilterChain::GetByte(); // Get the byte from up the chain.
- if(0 < (x & 0x80)) { // Check for a high bit byte (non-ascii).
- FoundHighBitCharacters = true; // If it is found then set the flag.
- } // If not then at least we checked ;-)
- return x; // Return the byte.
- }
-
- public:
-
- unsigned char GetByte() { // Overload the main fn().
- return (*this.*Mode)(); // Call the Internal function for this mode.
- }
-
- FilterChainHeaderAnalysis(FilterChain* S, FilterChainIPTester& T) : // Construct with the chain and a tester.
- FilterChain(S), // Capture the chain.
- IPTester(T), // Capture the tester.
- IPToTest(""), // IPToTest and
- IPTestResult(""), // IPTestResult are both empty to start.
- FoundFrom(false), // Set all of the "found" bits to false.
- FoundTo(false),
- FoundCC(false),
- FoundMessageID(false),
- FoundDate(false),
- FoundSubject(false),
- FoundHighBitCharacters(false),
- Mode(&FilterChainHeaderAnalysis::doSeekDispatch) { // Start in SeekDispatch() mode
- } // -- first byte of a new line ;-)
-
- bool MissingFrom() { return (!FoundFrom); } // True if missing From header.
- bool MissingTo() { return (!FoundTo); } // True if missing To header.
- bool MissingCC() { return (!FoundCC); } // True if missing CC header.
- bool MissingSubject() { return (!FoundSubject); } // True if missing Subject header.
- bool MissingDate() { return (!FoundDate); } // True if missing Date header.
- bool MissingMessageID() { return (!FoundDate); } // True if missing MessageID header.
- bool HighBitCharacters() { return (FoundHighBitCharacters); } // True if High bit characters were found.
-
- };
-
- #endif
|