You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

FilterChain.hpp 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768
  1. // FilterChain.hpp
  2. //
  3. // (C) 2002-2009 MicroNeil Research Corporation
  4. //
  5. // This is the base class header for FilterChain objects.
  6. // FilterChain objects can be chained together to filter
  7. // a byte stream. Each object produces a single character
  8. // per call. It will also call its source object for the
  9. // next character as required.
  10. // History...
  11. // 20060822 _M
  12. // Adding FilterChainHeaderAnalysis to identify missing headers and header
  13. // anomalies, and to extract and test IP data.
  14. // 20060127 _M
  15. // Added FilterChainCBFR to accept a buffer of a specific
  16. // length.
  17. // 20041116 _M Added UrlDecode module. The module will repeat a decoded version of
  18. // any anchor tag that it sees which contains decodable %xx bytes. Other anchor
  19. // tags are not repeated.
  20. // 20041116 _M Upgrades to the Defunker module. The module now decodes any HTML
  21. // encoded bytes that could have been normal ascii.
  22. // 20041114 _M Completed basic defunker engine which strips out all HTML and some
  23. // basic   encoding.
  24. // 20041113 _M Began heavy upgrades to this module to improve performance and
  25. // provide additional obfuscation removal. This modification will include a move
  26. // from the use of switch(State) mechanisms to the use of function pointers. This
  27. // should save a few cycles on every byte processed.
  28. // 20021025 _M
  29. // Added FilterChainCString to accept a Null Terminated
  30. // String (CString). Except for the input form it operates
  31. // exactly like the FilterChainInput form as modified below.
  32. // This allows WebClay to deliver the message using a buffer
  33. // rather than a file.
  34. // 20021015 _M
  35. // Modified FilterChainInput to eat control characters and
  36. // <CR> bytes so that the input stream "appears" always to
  37. // be terminated in the *nix standard \n. Tabs are also passed
  38. // but all other low bytes are eaten.
  39. // 20020721 _M File Created.
  40. // This is the base class - nothing special happens here
  41. // except defining the basic format of a FilterChain object.
  42. // If this object is instantiated, then it will simply return
  43. // its source's data, or a stream of '0's if none has been
  44. // defined.
  45. #ifndef _MN_FilterChain
  46. #define _MN_FilterChain
  47. #include <stdexcept>
  48. #include <iostream>
  49. #include <sstream>
  50. #include <string>
  51. #include <cstring>
  52. #include <cstdlib>
  53. #include <cctype>
  54. using namespace std;
  // Module-wide parameters.
  static const int ScanBufferSize = 128;                    // Size, in bytes, of a scan buffer.
  // FilterChain
  // Base class for every link in a filter chain. A plain FilterChain adds no
  // filtering of its own: GetByte() forwards to the source link, or returns 0
  // on every call when the object was built without a source.
  class FilterChain {

    private:

      FilterChain* Source;                                  // Where we get our data (NULL if none).

    public:

      // Thrown by constructors that are handed an unusable source.
      class BadSource : public std::invalid_argument {
        public: BadSource(const std::string& w) : std::invalid_argument(w) {}
      };

      // The "Empty" exception (an underflow_error).
      class Empty : public std::underflow_error {
        public: Empty(const std::string& w) : std::underflow_error(w) {}
      };

      // GetByte() is virtual, so links are used polymorphically. A virtual
      // destructor makes destroying a derived link through a FilterChain*
      // well defined.
      virtual ~FilterChain() {}

      // Returns the next byte: 0 when there is no source, otherwise whatever
      // the source's GetByte() returns (exceptions from the source propagate).
      virtual unsigned char GetByte() {
        if(NULL == Source) return 0;
        return Source->GetByte();
      }

      // Default constructor: no source.
      FilterChain() : Source(NULL) {}

      // Sourced constructor. Throws BadSource if S is NULL.
      FilterChain(FilterChain* S) : Source(S) {
        if(NULL == S) throw BadSource("FilterChain: NULL source not valid");
      }
  };
  79. // FilterChainInput
  80. // This version of FilterChain accepts an istream as a source and
  81. // gets a single character from it at each GetByte();
  82. class FilterChainInput : public FilterChain {
  83. private:
  84. istream* SourceIstream;
  85. public:
  86. // Here we overload the GetByte() function to get a byte
  87. // from the source stream. This is a litle bit special because
  88. // we're going to start our filtering process. Since we are
  89. // filtering text streams for pattern matching systems we will
  90. // eat any special control characters we get - including <CR>.
  91. // This helps us standardize on a *nix model for line ends as
  92. // each line end will be \n. It also gets rid of a lot of junk.
  93. unsigned char GetByte() { // Get the next byte.
  94. char i; // Keep it here.
  95. do{ // Loop to eat junk.
  96. SourceIstream->get(i); // Read the next byte...
  97. if(!SourceIstream->good()) // If something went wrong then
  98. throw Empty("FilterChain: No more data"); // throw the empty exception.
  99. if(i >= ' ') break; // Send all good bytes right away.
  100. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  101. // Otherwise quietly eat anything
  102. } while(true); // less than a space.
  103. return i; // Return the latest byte...
  104. }
  105. // Here we overload the constructor to accept a stream.
  106. FilterChainInput(istream* S){ // Build me with a stream.
  107. if(NULL==S) throw BadSource("FilterChainInput: Null source not valid" ); // If it's NULL that's bad.
  108. if(!S->good()) throw BadSource("FilterChainInput: Bad istream"); // Not good is bad.
  109. else SourceIstream = S; // If it's good we keep it.
  110. }
  111. FilterChainInput() { // If we don't have a source then
  112. throw BadSource("FilterChainInput: Source required"); // we're no good.
  113. }
  114. };
  115. // FilterChainCString
  116. // This version sources the data for the chain from a message buffer, or
  117. // more precisely a null terminated string. The basic operation is identical
  118. // to that of FilterChainInput above except that we're not working with
  119. // a filestream as an input.
  120. class FilterChainCString : public FilterChain {
  121. private:
  122. unsigned char* InputBuffer;
  123. int BufferIndex;
  124. public:
  125. // Here we overload GetByte() just like we do in FilterChainInput
  126. // except that we're going to get our data from a NULL terminated
  127. // string instead of a stream. IN FACT ... the code below was simply
  128. // copied from FilterChainInput and modified in place.
  129. unsigned char GetByte() { // Get the next byte.
  130. unsigned char i; // Keep it here.
  131. do{ // Loop to eat junk.
  132. i = InputBuffer[BufferIndex++]; // Read the next byte...
  133. if(0 == i) // If there's nothing left then
  134. throw Empty("FilterChainCString: No more data"); // throw the empty exception.
  135. if(i >= ' ') break; // Send all good bytes right away.
  136. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  137. // Otherwise quietly eat anything
  138. } while(true); // less than a space.
  139. return i; // Return the latest byte...
  140. }
  141. // Here we overload the constructor to accept a stream.
  142. FilterChainCString(unsigned char* S){ // Build me with a char buffer.
  143. if(NULL==S) throw BadSource("FilterChainCString: NULL source not valid"); // If it's NULL that's bad.
  144. if(0==S[0]) throw BadSource("FilterChainCString: Empty source not valid"); // Empty is bad.
  145. else InputBuffer = S; // If it's good we keep it.
  146. BufferIndex = 0; // Always start at index 0.
  147. }
  148. FilterChainCString() { // If we don't have a source then
  149. throw BadSource("FilterChainCString: Source required"); // we're no good.
  150. }
  151. };
  152. // FilterChainCBFR
  153. // This version sources the data for the chain from a message buffer, NOT
  154. // a null terminated string. The basic operation is identical to FilterChainCString
  155. // except that this version requires the length of the buffer and stops when that
  156. // number of characters have been read.
  157. class FilterChainCBFR : public FilterChain {
  158. private:
  159. unsigned char* InputBuffer;
  160. int BufferIndex;
  161. int BufferLength;
  162. stringstream& PrependedHeaders;
  163. bool PrependNotBuffer;
  164. public:
  165. // Here we overload GetByte() just like we do in FilterChainInput
  166. // except that we're going to get our data from a known length char
  167. // buffer instead of a stream. IN FACT ... the code below was simply
  168. // copied from FilterChainCString and modified in place.
  169. unsigned char GetByte() { // Get the next byte.
  170. unsigned char i; // Keep it here.
  171. if(PrependNotBuffer) { // While in prepend mode:
  172. if(BufferIndex < PrependedHeaders.str().length()) { // If there is more to get
  173. i = PrependedHeaders.str().at(BufferIndex); // then get it and move
  174. ++BufferIndex; // the index.
  175. } else { // As soon as we run out
  176. PrependNotBuffer = false; // of prepended headers switch
  177. BufferIndex = 0; // to the CBFR and reset the index.
  178. return GetByte(); // Recurse to get the next byte.
  179. }
  180. } else { // While in buffer mode:
  181. do{ // Loop to eat junk.
  182. if(BufferLength <= BufferIndex) // If there's nothing left then
  183. throw Empty("FilterChainCBFR: No more data"); // throw the empty exception.
  184. i = InputBuffer[BufferIndex++]; // Read the next byte...
  185. if(i >= ' ') break; // Send all good bytes right away.
  186. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  187. // Otherwise quietly eat anything
  188. } while(true); // less than a space.
  189. }
  190. return i; // Return the latest byte...
  191. }
  192. // Here we overload the constructor to accept a stream.
  193. FilterChainCBFR(unsigned char* S, int l, stringstream& P) : // Give me a bfr and a stringstream.
  194. InputBuffer(S), // Grab the buffer,
  195. BufferLength(l), // Grab the buffer length,
  196. BufferIndex(0), // Initialize the index to 0,
  197. PrependedHeaders(P), // Grab the PrependedHeaders reference.
  198. PrependNotBuffer(true) { // Do PrependedHeaders first.
  199. if(NULL==S) throw BadSource("FilterChainCBFR: NULL source not valid"); // If it's NULL that's bad.
  200. if(0==l && 0==P.str().length())
  201. throw BadSource("FilterChainCBFR: Empty source not valid"); // Empty is bad.
  202. }
  203. };
  // FilterChainBase64
  // This version decodes base64 content in email messages. It begins
  // to decode this as soon as it sees the following message and two
  // blank lines indicating the coding has started.
  //
  // Content-Transfer-Encoding: base64
  //
  // Once it sees a bad character or what appears to be the start of
  // a new MIME segment, the filter turns off and passes through its
  // source data.

  // Startup string for the filter. The leading <LF> is kept so that a match
  // can only begin at the start of a line.
  static const unsigned char Base64Start[] = "\nContent-Transfer-Encoding: base64";

  // Marker stored in the lookup table for every byte that is not valid base64.
  static const unsigned char XX64 = 0xFF;

  // Lookup table: index with the incoming byte, get back either XX64 or the
  // six bit base64 value. The pad character '=' is a special case and maps
  // to 0x00.
  static const unsigned char Base64Table[256] = {
    //  x0    x1    x2    x3    x4    x5    x6    x7    x8    x9    xA    xB    xC    xD    xE    xF
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // 0x
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // 1x
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, 0x3E, XX64, XX64, XX64, 0x3F,   // 2x
      0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, XX64, XX64, XX64, 0x00, XX64, XX64,   // 3x
      XX64, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,   // 4x
      0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, XX64, XX64, XX64, XX64, XX64,   // 5x
      XX64, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,   // 6x
      0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, XX64, XX64, XX64, XX64, XX64,   // 7x
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // 8x
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // 9x
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // Ax
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // Bx
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // Cx
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // Dx
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64,   // Ex
      XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64, XX64    // Fx
  };

  // Bit positions of the four six bit segments when they are packed together
  // on the way from 4 six bit values to 3 octets.
  static const unsigned char base64_seg0_shift = 18;
  static const unsigned char base64_seg1_shift = 12;
  static const unsigned char base64_seg2_shift = 6;
  static const unsigned char base64_seg3_shift = 0;
  249. class FilterChainBase64 : public FilterChain {
  250. private:
  251. unsigned char x,y; // We need a few holding bins.
  252. unsigned int Workspace; // Numerical workspace for conversion.
  253. enum FilterState { // Operating State Codes.
  254. SCANNING, // One-in = One-out, looking for startup.
  255. DEQUEING, // Delivering buffered data.
  256. DECODING // Delivering filtered data.
  257. } State;
  258. int ScanIx; // Scanning Index.
  259. int DequeIx; // Dequeing Index.
  260. unsigned char Buffer; // Define a buffer.
  261. bool ValidByte(unsigned char y); // True if y can be decoded.
  262. public:
  263. unsigned char GetByte(); // Overload the main fn().
  264. FilterChainBase64(FilterChain* S) // Sourced constructor...
  265. :FilterChain(S){ // Call the base constructor.
  266. State = SCANNING; // Set filter inactive.
  267. ScanIx=DequeIx=0; // Reset our indexes.
  268. } // We're all ready to start.
  269. FilterChainBase64() { // Don't allow any
  270. throw BadSource("FilterChainBase64: Source required"); // null constructors.
  271. }
  272. };
  273. // FilterChainQuotedPrintable
  274. // This version decodes quoted-printable content in email messages.
  275. //
  276. // For simplicity this one is always on. That is, whenever it sees a
  277. // convertable quoted printable byte it will exchange it for the byte
  278. // that is represented. This is only intended for operation preceeding the
  279. // spam filter engine so it is safe to make these conversions.
  280. class FilterChainQuotedPrintable : public FilterChain {
  281. private:
  282. long int Workspace; // Plain Text Workspace.
  283. enum FilterState { // Operating State Codes
  284. SCANNING, // One-in = One-out - looking for start-up.
  285. DEQUEING, // Delivering buffered data.
  286. DECODING // Delivering filtered data.
  287. } State;
  288. int BufferLength; // How full is the buffer.
  289. int BufferIndex; // What byte are we on?
  290. unsigned char Buffer[ScanBufferSize]; // Define the buffer.
  291. bool isHexDigit(unsigned char i); // true if i is a hex digit byte.
  292. int convertHexDigit(unsigned char i); // returns integer value of hex digit i.
  293. public:
  294. unsigned char GetByte(); // Overload the main fn().
  295. FilterChainQuotedPrintable(FilterChain* S) // Sourced constructor...
  296. :FilterChain(S){ // Call the base constructor.
  297. State = SCANNING; // Set to the initial state.
  298. BufferIndex = 0; // Initial buffer index.
  299. BufferLength = 0; // Initial buffer length.
  300. Workspace = 0; // Clear the workspace.
  301. }
  302. FilterChainQuotedPrintable() { // Don't allow any
  303. throw BadSource("FilterChainQuotedPrintable: Source required"); // null constructors.
  304. }
  305. };
  // FilterChainDefunker
  // This module stores a copy of the stream containing HTML and then emits it
  // at the end of the stream with all of the html elements removed and/or decoded
  // to eliminate html based obfuscation.
  class FilterChainDefunker;

  static const int DefunkerSize = 32768;                    // Store size.
  static const int DefunkerQueueSize = 24;                  // Size of defunker queue.

  // The strings below are fixed, so the pointers themselves are const as
  // well as the characters: a header-scope pointer that could be re-seated
  // would silently change what every matcher in the translation unit sees.
  static const char* const DefunkerPreamble = " ----[DEFUNKER]---- ";

  // Patterns to match
  static const char* const patMatchBR = "<br>";
  static const char* const patMatchP = "<p>";
  static const char* const patNBSP = "&nbsp;";
  static const char* const patAMP = "&amp;";
  static const char* const patAPOS = "&apos;";
  static const char* const patLT = "&lt;";
  static const char* const patGT = "&gt;";
  static const char* const patQUOT = "&quot;";
  323. class FilterChainDefunker : public FilterChain { // Class definition.
  324. private:
  325. unsigned char StoreBuffer[DefunkerSize];
  326. int InputPosition;
  327. int OutputPosition;
  328. // Nodes in the state change model are represented by functions.
  329. // These modes represent the state prior to getting the Empty exception.
  330. // During this mode, the Defunker simply stores a portion of the message
  331. // to be scanned later.
  332. unsigned char LastRawByte; // Last Raw Byte (for SkipHeaders);
  333. unsigned char SkipHeaders(); // Skips the headers before Store();
  334. unsigned char Store(); // Stores the message content for later.
  335. // Here is a handy Queue mechanism for recovering failed patterns.
  336. int QueueLength; // Queue Length (write position).
  337. int QueuePosition; // Queue Read Position.
  338. unsigned char Qbfr[DefunkerQueueSize]; // Queue Buffer.
  339. void ClearQueue() { // Clear the queue.
  340. memset(Qbfr,0,sizeof(Qbfr)); // Reset the buffer.
  341. QueueLength = 0; // Zero the length.
  342. QueuePosition = 0; // Zero the position.
  343. }
  344. unsigned char DeQueue() { // Empty the queue then back to DefunkRoot.
  345. if(QueuePosition >= QueueLength) { // If the queue is empty then
  346. ClearQueue(); // clear the queue,
  347. Internal = &FilterChainDefunker::DefunkRoot; // go back to DefunkRoot mode,
  348. return GetInternal(); // and return the next byte.
  349. } // If the queue is not empty then
  350. return Qbfr[QueuePosition++]; // return the next byte from the queue.
  351. }
  352. void EnQueue(unsigned char x) { // Add a byte to the queue.
  353. if(QueueLength<DefunkerQueueSize) // If we are safely within the buffer
  354. Qbfr[QueueLength++] = x; // then add this byte to the queue.
  355. }
  356. // These modes represent the Defunker pulling data out of it's
  357. // stored copy so that it can be filtered and delivered to the scanner.
  358. // These modes get turned on once the Empty exception is read from
  359. // the underlying source.
  360. unsigned char Preamble(); // Preamble - separates Defunked text.
  361. unsigned char DefunkRoot(); // Root in Defunk mode.
  362. unsigned char OpenTag(); // Open tag detected.
  363. unsigned char OpenAmp(); // Open & tag.
  364. unsigned char MatchBR(); // Matching <br>
  365. unsigned char MatchP(); // Matching <p>
  366. unsigned char MatchNBSP(); // Matching &nbps;
  367. unsigned char SwitchAMPAPOS(); // Looking for AMP or APOS.
  368. unsigned char MatchAMP(); // Matching &amp;
  369. unsigned char MatchAPOS(); // Matching &apos;
  370. unsigned char MatchLT(); // Matching &lt;
  371. unsigned char MatchGT(); // Matching &gt;
  372. unsigned char MatchQUOT(); // Matching &quot;
  373. unsigned char EatTag(); // Eating an unknown tag.
  374. unsigned char DecodeNum(); // Decoding &#...number...;
  375. // Part of defunking is to convert all runs of whitespace into a single space.
  376. // It also doubles as the master output function once we're out of Store() mode.
  377. unsigned char SpaceConvChart[256]; // Space conversion chart.
  378. unsigned char LastReadOut; // Last ReadOut byte (for deduping spaces).
  379. unsigned char ReadOut(); // Read out the store through the filter.
  380. unsigned char LastGetStore; // Last GetStore byte (for EatTag).
  381. unsigned char GetStore(); // Read a byte from the store.
  382. // Here is a handy pattern match function for eliminating some tags.
  383. bool MatchTagPattern(const char* pattern) { // Matches pattern. True if matched.
  384. int pos = 2; // Now on the third byte (index 2).
  385. while(pattern[pos]){ // While we have more bytes to match
  386. unsigned char x = GetStore(); // grab the next byte.
  387. // Special case - HTML tag with a space as in <p stuff>
  388. if(x==' ' && pattern[pos]=='>') { // If we have a tag with parameters.
  389. pos++; // Move pos forward to it's null.
  390. while(GetStore()!='>')continue; // Eat up to the > and then
  391. break; // we are done.
  392. }
  393. // In the normal case follow the pattern.
  394. if(tolower(x)!=pattern[pos]) break; // If we fell off then stop.
  395. pos++; // If we didn't break move ahead.
  396. }
  397. // At this point we are either at the null in our pattern or we did not match.
  398. if(pattern[pos]) { return false; } // If we're not at the end then no match.
  399. return true; // Otherwise we do have a match :-)
  400. }
  401. // These are the function pointers that map the current state of this object.
  402. unsigned char (FilterChainDefunker::*Master)(); // Master function for GetByte()
  403. unsigned char (FilterChainDefunker::*Internal)(); // Internal function for GetByte()
  404. public:
  405. unsigned char GetByte() { // Overload the main fn().
  406. return (*this.*Master)(); // Call the master function.
  407. }
  408. unsigned char GetInternal() { // Internal state machine get.
  409. return (*this.*Internal)(); // Call the internal function.
  410. }
  411. FilterChainDefunker(FilterChain* S) // Sourced constructor...
  412. :FilterChain(S), // Call the base constructor.
  413. Master(&FilterChainDefunker::SkipHeaders), // Set the initial external and
  414. Internal(&FilterChainDefunker::Preamble), // internal states.
  415. InputPosition(0), // Reset both position pointers.
  416. OutputPosition(0),
  417. LastReadOut(0),
  418. LastGetStore(0),
  419. LastRawByte(0) {
  420. ClearQueue(); // Clear the queue;
  421. memset(StoreBuffer,0,sizeof(StoreBuffer)); // Clear the store buffer.
  422. for(int i=0;i<256;i++) SpaceConvChart[i]=i; // Initialize the chart.
  423. SpaceConvChart[(int)'\r']=' '; // Convert <CR> to space.
  424. SpaceConvChart[(int)'\n']=' '; // Convert <LF> to space.
  425. SpaceConvChart[(int)'\t']=' '; // Convert Tab to space.
  426. }
  427. FilterChainDefunker() { // Don't allow any
  428. throw BadSource("FilterChainDefunker: Source required"); // null constructors.
  429. }
  430. };
  431. // FilterChainUrlDecode
  432. // This module removes any unnecessary URL encoding within an <a...> tag. The
  433. // cleaned up version (if different) is emitted immediately after the original
  434. // <a...> tag so that both versions can be interpreted by the pattern scanner.
  435. // This is designed to eliminate common obfuscation techniques.
  436. const int UrlDecodeBfrSize = 256; // Decode Buffer Size.
  437. class FilterChainUrlDecode : public FilterChain {
  438. private:
  439. unsigned char DecodeBfr[UrlDecodeBfrSize]; // Decoded anchor buffer.
  440. int DecodeLength; // Decoded anchor length.
  441. int DecodePosition; // Read (Inject) Position.
  442. bool DecodeFlag; // True if the URL was decoded.
  443. void Clear() { // Function to clear the bfr.
  444. memset(DecodeBfr,0,sizeof(DecodeBfr)); // Null it out and set
  445. DecodeLength = 0; // the length to zero.
  446. DecodePosition = 0; // Reset the Read position.
  447. DecodeFlag = false; // Reset the Decode Flag.
  448. }
  449. void AddToBfr(unsigned char c) { // Safely add to our buffer.
  450. if(DecodeLength < sizeof(DecodeBfr)-1) // If we have more room then
  451. DecodeBfr[DecodeLength++] = c; // write the incoming byte.
  452. }
  453. unsigned char (FilterChainUrlDecode::*Internal)(); // Internal State Fn
  454. bool isHexDigit(unsigned char i); // Is i a hex digit?
  455. int convertHexDigit(unsigned char i); // Convert a single hex digit.
  456. unsigned char convertHexByte(unsigned char* x); // Convert a hex byte.
  457. // Here are the states of the UrlDecode module...
  458. unsigned char Bypass(); // Bypass - waiting for '<'
  459. unsigned char Tag(); // Looks for an 'a' or 'i' after '<'
  460. unsigned char Img1(); // Looks for 'm' in <img
  461. unsigned char Img2(); // Looks for 'g' in <img
  462. unsigned char Root(); // Root state of the decode FSM.
  463. unsigned char GetD1(); // Decoding step one.
  464. unsigned char GetD2(); // Decoding step two.
  465. unsigned char Inject(); // Injects the bfr into the stream.
  466. public:
  467. unsigned char GetByte() { // Overload the main fn().
  468. return (*this.*Internal)(); // Call the Internal function.
  469. }
  470. FilterChainUrlDecode(FilterChain* S) // Sourced constructor...
  471. :FilterChain(S), // Call the base constructor.
  472. Internal(&FilterChainUrlDecode::Bypass) { // Set ByPass mode.
  473. Clear(); // Clear the system.
  474. }
  475. FilterChainUrlDecode() { // Don't allow any
  476. throw BadSource("FilterChainUrlDecode: Source required"); // null constructors.
  477. }
  478. };
  479. // FilterChainHeaderAnalysis (and friends)
  480. // Performs header anomaly analysis and IP extraction and analysis.
  481. // IP Analysis is performed via a provided class that implements the IPTester
  482. // interface. An IP is provided to the IPTester as a [#.#.#.#] string. The
  483. // IPTester may respond with information to be emitted into the headers for
  484. // the pattern matching engine based on those results --- or not ;-)
  // FilterChainIPTester
  // Interface implemented by the IP tester handed to FilterChainHeaderAnalysis.
  //
  // The supplied test() function accepts the input string and returns the
  // output string. If desired, the output string can be modified to include
  // data from the tests that will be emitted into the data stream for the
  // pattern analysis engine to see. Otherwise, the output string should
  // remain blank. The test() function _should_ be thread safe -- that is why
  // we pass it both input and output ;-)
  //
  // The provided tester may have any side-effects that are desired.
  class FilterChainIPTester {

    public:

      // Implementations are used through this interface, so destroying one
      // through a FilterChainIPTester* must be well defined.
      virtual ~FilterChainIPTester() {}

      virtual std::string& test(std::string& input, std::string& output) = 0;
  };
  497. class FilterChainHeaderAnalysis : public FilterChain {
  498. private:
  499. unsigned char (FilterChainHeaderAnalysis::*Mode)(); // Internal State Fn Pointer (What Mode)
  500. FilterChainIPTester& IPTester; // This is the IP tester we use.
  501. string IPToTest; // String to capture IPs for testing.
  502. string IPTestResult; // String to receive IPtest results.
  503. // Header analysis output state...
  504. string EndOfHeaderResults; // String to capture EndOfHeaderResults.
  505. // OutputIndex and OutputLength are used to inject string data.
  506. // These are used to inject IPTestResult data and Header Analysis data.
  507. char* OutputBuffer; // Pointer to output injection string.
  508. int OutputIndex; // End of header output results index.
  509. void SetOutputBuffer(string& s); // Setup the OutputBuffer.
  510. unsigned char doInjectIPTestResult(); // Inject OutputBuffer and go to doSeekNL.
  511. unsigned char doInjectAnalysis(); // Inject OutputBuffer and go to doOff.
  512. // Header seek pattern state...
  513. // These tools work to follow patterns for header tags.
  514. // SetFollowPattern resets the engine and establishes the pattern to follow.
  515. // FollowPattern checks c against the next byte in the pattern.
  516. // -1 = The pattern failed.
  517. // 1 = The pattern was followed.
  518. // 0 = The pattern is complete.
  519. const char* MatchPattern; // Current pattern to match.
  520. int MatchIndex; // Pattern match following index.
  521. void SetFollowPattern(const char* p) { MatchPattern = p; MatchIndex = 0; } // Set the pattern to follow.
  522. int FollowPattern(char c); // Follow the pattern.
  523. //// Internal modes for this module...
  524. unsigned char doSeekNL(); // Looking for a new line.
  525. unsigned char doSeekDispatch(); // Looking at the first char after NL.
  526. unsigned char doReceived(); // Identifying a Received: header.
  527. unsigned char doFindIP(); // Seeking the [IP] in a Received header.
  528. unsigned char doTestIP(); // Gets and tests the [IP].
  529. unsigned char doFrom(); // Identifying a From: header.
  530. unsigned char doTo(); // Identifying a To: header.
  531. unsigned char doCC(); // Identifying a CC: header.
  532. unsigned char doMessageID(); // Identifying a MessageID header.
  533. unsigned char doDate(); // Identifying a Date: header.
  534. unsigned char doSubject(); // Identifying a Subject: header.
  535. unsigned char doEndOfHeaders(); // IdentifyEndOfHeaders & Emit Results.
  536. unsigned char doOff() { return FilterChain::GetByte(); } // Bypass mode.
  537. bool FoundFrom; // True if From: was found.
  538. bool FoundTo; // True if To: was found.
  539. bool FoundCC; // True if CC: was found.
  540. bool FoundMessageID; // True if Message-ID: was found.
  541. bool FoundDate; // True if Date: was found.
  542. bool FoundSubject; // True if Subject: was found.
  543. bool FoundHighBitCharacters; // True if high bit characters were found.
  544. unsigned char GetCheckedByte() { // Internal GetByte & check for high bits.
  545. unsigned char x = FilterChain::GetByte(); // Get the byte from up the chain.
  546. if(0 < (x & 0x80)) { // Check for a high bit byte (non-ascii).
  547. FoundHighBitCharacters = true; // If it is found then set the flag.
  548. } // If not then at least we checked ;-)
  549. return x; // Return the byte.
  550. }
  551. public:
  552. unsigned char GetByte() { // Overload the main fn().
  553. return (*this.*Mode)(); // Call the Internal function for this mode.
  554. }
  555. FilterChainHeaderAnalysis(FilterChain* S, FilterChainIPTester& T) : // Construct with the chain and a tester.
  556. FilterChain(S), // Capture the chain.
  557. IPTester(T), // Capture the tester.
  558. IPToTest(""), // IPToTest and
  559. IPTestResult(""), // IPTestResult are both empty to start.
  560. FoundFrom(false), // Set all of the "found" bits to false.
  561. FoundTo(false),
  562. FoundCC(false),
  563. FoundMessageID(false),
  564. FoundDate(false),
  565. FoundSubject(false),
  566. FoundHighBitCharacters(false),
  567. Mode(&FilterChainHeaderAnalysis::doSeekDispatch) { // Start in SeekDispatch() mode
  568. } // -- first byte of a new line ;-)
  569. bool MissingFrom() { return (!FoundFrom); } // True if missing From header.
  570. bool MissingTo() { return (!FoundTo); } // True if missing To header.
  571. bool MissingCC() { return (!FoundCC); } // True if missing CC header.
  572. bool MissingSubject() { return (!FoundSubject); } // True if missing Subject header.
  573. bool MissingDate() { return (!FoundDate); } // True if missing Date header.
  574. bool MissingMessageID() { return (!FoundDate); } // True if missing MessageID header.
  575. bool HighBitCharacters() { return (FoundHighBitCharacters); } // True if High bit characters were found.
  576. };
  577. #endif