Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757
  1. // FilterChain.hpp
  2. //
  3. // (C) 2002-2009 MicroNeil Research Corporation
  4. //
  5. // This is the base class header for FilterChain objects.
  6. // FilterChain objects can be chained together to filter
  7. // a byte stream. Each object produces a single character
  8. // per call. It will also call its source object for the
  9. // next character as required.
  10. // History...
  11. // 20060822 _M
  12. // Adding FilterChainHeaderAnalysis to identify missing headers and header
  13. // anomalies, and to extract and test IP data.
  14. // 20060127 _M
  15. // Added FilterChainCBFR to accept a buffer of a specific
  16. // length.
  17. // 20041116 _M Added UrlDecode module. The module will repeat a decoded version of
  18. // any anchor tag that it sees which contains decodable %xx bytes. Other anchor
  19. // tags are not repeated.
  20. // 20041116 _M Upgrades to the Defunker module. The module now decodes any HTML
  21. // encoded bytes that could have been normal ascii.
  22. // 20041114 _M Completed basic defunker engine which strips out all HTML and some
  23. // basic   encoding.
  24. // 20041113 _M Began heavy upgrades to this module to improve performance and
  25. // provide additional obfuscation removal. This modification will include a move
  26. // from the use of switch(State) mechanisms to the use of function pointers. This
  27. // should save a few cycles on every byte processed.
  28. // 20021025 _M
  29. // Added FilterChainCString to accept a Null Terminated
  30. // String (CString). Except for the input form it operates
  31. // exactly like the FilterChainInput form as modified below.
  32. // This allows WebClay to deliver the message using a buffer
  33. // rather than a file.
  34. // 20021015 _M
  35. // Modified FilterChainInput to eat control characters and
  36. // <CR> bytes so that the input stream "appears" always to
  37. // be terminated in the *nix standard \n. Tabs are also passed
  38. // but all other low bytes are eaten.
  39. // 20020721 _M File Created.
  40. // This is the base class - nothing special happens here
  41. // except defining the basic format of a FilterChain object.
  42. // If this object is instantiated, then it will simply return
  43. // its source's data, or a stream of '0's if none has been
  44. // defined.
  45. #ifndef _MN_FilterChain
  46. #define _MN_FilterChain
  47. #include <stdexcept>
  48. #include <iostream>
  49. #include <sstream>
  50. #include <string>
  51. #include <cstring>
  52. #include <cstdlib>
  53. #include <cctype>
  54. using namespace std;
  // Module parameter: capacity, in bytes, of the scan buffers declared by
  // the filter classes in this header.
  static const int ScanBufferSize = 128;
  // FilterChain
  // Base link of a filter chain. Every call to GetByte() yields one byte:
  // the byte produced by the upstream link when one is attached, or 0 when
  // this link was default constructed and has no upstream link at all.
  class FilterChain {

   private:

    FilterChain* Source;                          // Upstream link (may be NULL).

   public:

    // Raised when a link is built without a usable source.
    class BadSource : public std::invalid_argument {
     public:
      BadSource(const std::string& w) : std::invalid_argument(w) {}
    };

    // Raised by links when they have no more data to deliver.
    class Empty : public std::underflow_error {
     public:
      Empty(const std::string& w) : std::underflow_error(w) {}
    };

    // Pass-through read: ask the upstream link, or answer 0 without one.
    virtual unsigned char GetByte() {
      return (NULL != Source) ? Source->GetByte() : 0;
    }

    // A default constructed link has no upstream link.
    FilterChain() : Source(NULL) {}

    // A sourced link insists on a real upstream link; NULL raises BadSource.
    FilterChain(FilterChain* S) : Source(S) {
      if(NULL == Source) throw BadSource("FilterChain: NULL source not valid");
    }

    // Virtual so that derived links can be destroyed through a base pointer.
    virtual ~FilterChain() {}
  };
  80. // FilterChainInput
  81. // This version of FilterChain accepts an istream as a source and
  82. // gets a single character from it at each GetByte();
  83. class FilterChainInput : public FilterChain {
  84. private:
  85. istream* SourceIstream;
  86. public:
  87. // Here we overload the GetByte() function to get a byte
  88. // from the source stream. This is a litle bit special because
  89. // we're going to start our filtering process. Since we are
  90. // filtering text streams for pattern matching systems we will
  91. // eat any special control characters we get - including <CR>.
  92. // This helps us standardize on a *nix model for line ends as
  93. // each line end will be \n. It also gets rid of a lot of junk.
  94. unsigned char GetByte() { // Get the next byte.
  95. char i; // Keep it here.
  96. do{ // Loop to eat junk.
  97. SourceIstream->get(i); // Read the next byte...
  98. if(!SourceIstream->good()) // If something went wrong then
  99. throw Empty("FilterChain: No more data"); // throw the empty exception.
  100. if(i >= ' ') break; // Send all good bytes right away.
  101. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  102. // Otherwise quietly eat anything
  103. } while(true); // less than a space.
  104. return i; // Return the latest byte...
  105. }
  106. // Here we overload the constructor to accept a stream.
  107. FilterChainInput(istream* S){ // Build me with a stream.
  108. if(NULL==S) throw BadSource("FilterChainInput: Null source not valid" ); // If it's NULL that's bad.
  109. if(!S->good()) throw BadSource("FilterChainInput: Bad istream"); // Not good is bad.
  110. else SourceIstream = S; // If it's good we keep it.
  111. }
  112. FilterChainInput() { // If we don't have a source then
  113. throw BadSource("FilterChainInput: Source required"); // we're no good.
  114. }
  115. };
  116. // FilterChainCString
  117. // This version sources the data for the chain from a message buffer, or
  118. // more precisely a null terminated string. The basic operation is identical
  119. // to that of FilterChainInput above except that we're not working with
  120. // a filestream as an input.
  121. class FilterChainCString : public FilterChain {
  122. private:
  123. unsigned char* InputBuffer;
  124. int BufferIndex;
  125. public:
  126. // Here we overload GetByte() just like we do in FilterChainInput
  127. // except that we're going to get our data from a NULL terminated
  128. // string instead of a stream. IN FACT ... the code below was simply
  129. // copied from FilterChainInput and modified in place.
  130. unsigned char GetByte() { // Get the next byte.
  131. unsigned char i; // Keep it here.
  132. do{ // Loop to eat junk.
  133. i = InputBuffer[BufferIndex++]; // Read the next byte...
  134. if(0 == i) // If there's nothing left then
  135. throw Empty("FilterChainCString: No more data"); // throw the empty exception.
  136. if(i >= ' ') break; // Send all good bytes right away.
  137. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  138. // Otherwise quietly eat anything
  139. } while(true); // less than a space.
  140. return i; // Return the latest byte...
  141. }
  142. // Here we overload the constructor to accept a stream.
  143. FilterChainCString(unsigned char* S){ // Build me with a char buffer.
  144. if(NULL==S) throw BadSource("FilterChainCString: NULL source not valid"); // If it's NULL that's bad.
  145. if(0==S[0]) throw BadSource("FilterChainCString: Empty source not valid"); // Empty is bad.
  146. else InputBuffer = S; // If it's good we keep it.
  147. BufferIndex = 0; // Always start at index 0.
  148. }
  149. FilterChainCString() { // If we don't have a source then
  150. throw BadSource("FilterChainCString: Source required"); // we're no good.
  151. }
  152. };
  153. // FilterChainCBFR
  154. // This version sources the data for the chain from a message buffer, NOT
  155. // a null terminated string. The basic operation is identical to FilterChainCString
  156. // except that this version requires the length of the buffer and stops when that
  157. // number of characters have been read.
  158. class FilterChainCBFR : public FilterChain {
  159. private:
  160. unsigned char* InputBuffer;
  161. unsigned int BufferLength;
  162. unsigned int BufferIndex;
  163. stringstream& PrependedHeaders;
  164. bool PrependNotBuffer;
  165. public:
  166. // Here we overload GetByte() just like we do in FilterChainInput
  167. // except that we're going to get our data from a known length char
  168. // buffer instead of a stream. IN FACT ... the code below was simply
  169. // copied from FilterChainCString and modified in place.
  170. unsigned char GetByte() { // Get the next byte.
  171. unsigned char i; // Keep it here.
  172. if(PrependNotBuffer) { // While in prepend mode:
  173. if(BufferIndex < PrependedHeaders.str().length()) { // If there is more to get
  174. i = PrependedHeaders.str().at(BufferIndex); // then get it and move
  175. ++BufferIndex; // the index.
  176. } else { // As soon as we run out
  177. PrependNotBuffer = false; // of prepended headers switch
  178. BufferIndex = 0; // to the CBFR and reset the index.
  179. return GetByte(); // Recurse to get the next byte.
  180. }
  181. } else { // While in buffer mode:
  182. do{ // Loop to eat junk.
  183. if(BufferLength <= BufferIndex) // If there's nothing left then
  184. throw Empty("FilterChainCBFR: No more data"); // throw the empty exception.
  185. i = InputBuffer[BufferIndex++]; // Read the next byte...
  186. if(i >= ' ') break; // Send all good bytes right away.
  187. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  188. // Otherwise quietly eat anything
  189. } while(true); // less than a space.
  190. }
  191. return i; // Return the latest byte...
  192. }
  193. // Here we overload the constructor to accept a stream.
  194. FilterChainCBFR(unsigned char* S, int l, stringstream& P) : // Give me a bfr and a stringstream.
  195. InputBuffer(S), // Grab the buffer,
  196. BufferLength(l), // Grab the buffer length,
  197. BufferIndex(0), // Initialize the index to 0,
  198. PrependedHeaders(P), // Grab the PrependedHeaders reference.
  199. PrependNotBuffer(true) { // Do PrependedHeaders first.
  200. if(NULL==S) throw BadSource("FilterChainCBFR: NULL source not valid"); // If it's NULL that's bad.
  201. if(0==l && 0==P.str().length())
  202. throw BadSource("FilterChainCBFR: Empty source not valid"); // Empty is bad.
  203. }
  204. };
  205. // FilterChainBase64
  206. // This version decodes base64 content in email messages. It begins
  207. // to decode this as soon as it sees the following message and two
  208. // blank lines indicating the coding has started.
  209. //
  210. // Content-Transfer-Encoding: base64
  211. //
  212. // Once it sees a bad character or what appears to be the start of
  213. // a new MIME segment, the filter turns off and passes through its
  214. // source data.
  // Startup string for the base64 filter. The leading <LF> is kept so that
  // a match is always anchored at the start of a line.

  static const unsigned char Base64Start[] = "\nContent-Transfer-Encoding: base64";

  // Marker stored in the lookup table for every byte that is not valid
  // base64 input.

  static const unsigned char XX64 = 0xFF;

  // Lookup table indexed by the incoming byte. Each entry is either XX64 or
  // the six bit base64 value of that byte. The pad character '=' is a
  // special case and is given the value 0x00.

  static const unsigned char Base64Table[256] = {

    //  x0   x1   x2   x3   x4   x5   x6   x7   x8   x9   xA   xB   xC   xD   xE   xF

    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // 0x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // 1x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,0x3E,XX64,XX64,XX64,0x3F,  // 2x
    0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,XX64,XX64,XX64,0x00,XX64,XX64,  // 3x
    XX64,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,  // 4x
    0x0F,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,XX64,XX64,XX64,XX64,XX64,  // 5x
    XX64,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,  // 6x
    0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,0x30,0x31,0x32,0x33,XX64,XX64,XX64,XX64,XX64,  // 7x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // 8x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // 9x
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // Ax
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // Bx
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // Cx
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // Dx
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,  // Ex
    XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64   // Fx
  };

  // Bit positions of the four six bit segments inside the workspace when
  // four base64 values are converted into three octets.

  static const unsigned char base64_seg0_shift = 18;
  static const unsigned char base64_seg1_shift = 12;
  static const unsigned char base64_seg2_shift = 6;
  static const unsigned char base64_seg3_shift = 0;
  250. class FilterChainBase64 : public FilterChain {
  251. private:
  252. unsigned char x,y; // We need a few holding bins.
  253. unsigned int Workspace; // Numerical workspace for conversion.
  254. enum FilterState { // Operating State Codes.
  255. SCANNING, // One-in = One-out, looking for startup.
  256. DEQUEING, // Delivering buffered data.
  257. DECODING // Delivering filtered data.
  258. } State;
  259. unsigned int ScanIx; // Scanning Index.
  260. unsigned int DequeIx; // Dequeing Index.
  261. unsigned char Buffer; // Define a buffer.
  262. bool ValidByte(unsigned char y); // True if y can be decoded.
  263. public:
  264. unsigned char GetByte(); // Overload the main fn().
  265. FilterChainBase64(FilterChain* S) // Sourced constructor...
  266. :FilterChain(S){ // Call the base constructor.
  267. State = SCANNING; // Set filter inactive.
  268. ScanIx=DequeIx=0; // Reset our indexes.
  269. } // We're all ready to start.
  270. FilterChainBase64() { // Don't allow any
  271. throw BadSource("FilterChainBase64: Source required"); // null constructors.
  272. }
  273. };
  274. // FilterChainQuotedPrintable
  275. // This version decodes quoted-printable content in email messages.
  276. //
  277. // For simplicity this one is always on. That is, whenever it sees a
  278. // convertable quoted printable byte it will exchange it for the byte
  279. // that is represented. This is only intended for operation preceeding the
  280. // spam filter engine so it is safe to make these conversions.
  281. class FilterChainQuotedPrintable : public FilterChain {
  282. private:
  283. long int Workspace; // Plain Text Workspace.
  284. enum FilterState { // Operating State Codes
  285. SCANNING, // One-in = One-out - looking for start-up.
  286. DEQUEING, // Delivering buffered data.
  287. DECODING // Delivering filtered data.
  288. } State;
  289. int BufferLength; // How full is the buffer.
  290. int BufferIndex; // What byte are we on?
  291. unsigned char Buffer[ScanBufferSize]; // Define the buffer.
  292. bool isHexDigit(unsigned char i); // true if i is a hex digit byte.
  293. int convertHexDigit(unsigned char i); // returns integer value of hex digit i.
  294. public:
  295. unsigned char GetByte(); // Overload the main fn().
  296. FilterChainQuotedPrintable(FilterChain* S) // Sourced constructor...
  297. :FilterChain(S){ // Call the base constructor.
  298. State = SCANNING; // Set to the initial state.
  299. BufferIndex = 0; // Initial buffer index.
  300. BufferLength = 0; // Initial buffer length.
  301. Workspace = 0; // Clear the workspace.
  302. }
  303. FilterChainQuotedPrintable() { // Don't allow any
  304. throw BadSource("FilterChainQuotedPrintable: Source required"); // null constructors.
  305. }
  306. };
  307. // FilterChainDefunker
  308. // This module stores a copy of the stream containing HTML and then emits it
  309. // at the end of the stream with all of the html elements removed and/or decoded
  310. // to eliminate html based obfuscation.
  311. class FilterChainDefunker;
  312. static const int DefunkerSize = 32768; // Store size.
  313. static const int DefunkerQueueSize = 24; // Size of defunker queue.
  314. class FilterChainDefunker : public FilterChain { // Class definition.
  315. private:
  316. unsigned char StoreBuffer[DefunkerSize];
  317. int InputPosition;
  318. int OutputPosition;
  319. // Nodes in the state change model are represented by functions.
  320. // These modes represent the state prior to getting the Empty exception.
  321. // During this mode, the Defunker simply stores a portion of the message
  322. // to be scanned later.
  323. unsigned char LastRawByte; // Last Raw Byte (for SkipHeaders);
  324. unsigned char SkipHeaders(); // Skips the headers before Store();
  325. unsigned char Store(); // Stores the message content for later.
  326. // Here is a handy Queue mechanism for recovering failed patterns.
  327. int QueueLength; // Queue Length (write position).
  328. int QueuePosition; // Queue Read Position.
  329. unsigned char Qbfr[DefunkerQueueSize]; // Queue Buffer.
  330. void ClearQueue() { // Clear the queue.
  331. memset(Qbfr,0,sizeof(Qbfr)); // Reset the buffer.
  332. QueueLength = 0; // Zero the length.
  333. QueuePosition = 0; // Zero the position.
  334. }
  335. unsigned char DeQueue() { // Empty the queue then back to DefunkRoot.
  336. if(QueuePosition >= QueueLength) { // If the queue is empty then
  337. ClearQueue(); // clear the queue,
  338. Internal = &FilterChainDefunker::DefunkRoot; // go back to DefunkRoot mode,
  339. return GetInternal(); // and return the next byte.
  340. } // If the queue is not empty then
  341. return Qbfr[QueuePosition++]; // return the next byte from the queue.
  342. }
  343. void EnQueue(unsigned char x) { // Add a byte to the queue.
  344. if(QueueLength<DefunkerQueueSize) // If we are safely within the buffer
  345. Qbfr[QueueLength++] = x; // then add this byte to the queue.
  346. }
  347. // These modes represent the Defunker pulling data out of it's
  348. // stored copy so that it can be filtered and delivered to the scanner.
  349. // These modes get turned on once the Empty exception is read from
  350. // the underlying source.
  351. unsigned char Preamble(); // Preamble - separates Defunked text.
  352. unsigned char DefunkRoot(); // Root in Defunk mode.
  353. unsigned char OpenTag(); // Open tag detected.
  354. unsigned char OpenAmp(); // Open & tag.
  355. unsigned char MatchBR(); // Matching <br>
  356. unsigned char MatchP(); // Matching <p>
  357. unsigned char MatchNBSP(); // Matching &nbps;
  358. unsigned char SwitchAMPAPOS(); // Looking for AMP or APOS.
  359. unsigned char MatchAMP(); // Matching &amp;
  360. unsigned char MatchAPOS(); // Matching &apos;
  361. unsigned char MatchLT(); // Matching &lt;
  362. unsigned char MatchGT(); // Matching &gt;
  363. unsigned char MatchQUOT(); // Matching &quot;
  364. unsigned char EatTag(); // Eating an unknown tag.
  365. unsigned char DecodeNum(); // Decoding &#...number...;
  366. // Part of defunking is to convert all runs of whitespace into a single space.
  367. // It also doubles as the master output function once we're out of Store() mode.
  368. unsigned char SpaceConvChart[256]; // Space conversion chart.
  369. unsigned char LastReadOut; // Last ReadOut byte (for deduping spaces).
  370. unsigned char ReadOut(); // Read out the store through the filter.
  371. unsigned char LastGetStore; // Last GetStore byte (for EatTag).
  372. unsigned char GetStore(); // Read a byte from the store.
  373. // Here is a handy pattern match function for eliminating some tags.
  374. bool MatchTagPattern(const char* pattern) { // Matches pattern. True if matched.
  375. int pos = 2; // Now on the third byte (index 2).
  376. while(pattern[pos]){ // While we have more bytes to match
  377. unsigned char x = GetStore(); // grab the next byte.
  378. // Special case - HTML tag with a space as in <p stuff>
  379. if(x==' ' && pattern[pos]=='>') { // If we have a tag with parameters.
  380. pos++; // Move pos forward to it's null.
  381. while(GetStore()!='>')continue; // Eat up to the > and then
  382. break; // we are done.
  383. }
  384. // In the normal case follow the pattern.
  385. if(tolower(x)!=pattern[pos]) break; // If we fell off then stop.
  386. pos++; // If we didn't break move ahead.
  387. }
  388. // At this point we are either at the null in our pattern or we did not match.
  389. if(pattern[pos]) { return false; } // If we're not at the end then no match.
  390. return true; // Otherwise we do have a match :-)
  391. }
  392. // These are the function pointers that map the current state of this object.
  393. unsigned char (FilterChainDefunker::*Master)(); // Master function for GetByte()
  394. unsigned char (FilterChainDefunker::*Internal)(); // Internal function for GetByte()
  395. public:
  396. unsigned char GetByte() { // Overload the main fn().
  397. return (*this.*Master)(); // Call the master function.
  398. }
  399. unsigned char GetInternal() { // Internal state machine get.
  400. return (*this.*Internal)(); // Call the internal function.
  401. }
  402. FilterChainDefunker(FilterChain* S) // Sourced constructor...
  403. :FilterChain(S), // Call the base constructor.
  404. InputPosition(0), // Reset both position pointers.
  405. OutputPosition(0),
  406. LastRawByte(0),
  407. LastReadOut(0),
  408. LastGetStore(0),
  409. Master(&FilterChainDefunker::SkipHeaders), // Set the initial external and
  410. Internal(&FilterChainDefunker::Preamble) { // internal states.
  411. ClearQueue(); // Clear the queue;
  412. memset(StoreBuffer,0,sizeof(StoreBuffer)); // Clear the store buffer.
  413. for(int i=0;i<256;i++) SpaceConvChart[i]=i; // Initialize the chart.
  414. SpaceConvChart[(int)'\r']=' '; // Convert <CR> to space.
  415. SpaceConvChart[(int)'\n']=' '; // Convert <LF> to space.
  416. SpaceConvChart[(int)'\t']=' '; // Convert Tab to space.
  417. }
  418. FilterChainDefunker() { // Don't allow any
  419. throw BadSource("FilterChainDefunker: Source required"); // null constructors.
  420. }
  421. };
  422. // FilterChainUrlDecode
  423. // This module removes any unnecessary URL encoding within an <a...> tag. The
  424. // cleaned up version (if different) is emitted immediately after the original
  425. // <a...> tag so that both versions can be interpreted by the pattern scanner.
  426. // This is designed to eliminate common obfuscation techniques.
  427. const int UrlDecodeBfrSize = 256; // Decode Buffer Size.
  428. class FilterChainUrlDecode : public FilterChain {
  429. private:
  430. unsigned char DecodeBfr[UrlDecodeBfrSize]; // Decoded anchor buffer.
  431. unsigned int DecodeLength; // Decoded anchor length.
  432. unsigned int DecodePosition; // Read (Inject) Position.
  433. bool DecodeFlag; // True if the URL was decoded.
  434. void Clear() { // Function to clear the bfr.
  435. memset(DecodeBfr,0,sizeof(DecodeBfr)); // Null it out and set
  436. DecodeLength = 0; // the length to zero.
  437. DecodePosition = 0; // Reset the Read position.
  438. DecodeFlag = false; // Reset the Decode Flag.
  439. }
  440. void AddToBfr(unsigned char c) { // Safely add to our buffer.
  441. if(DecodeLength < sizeof(DecodeBfr)-1) // If we have more room then
  442. DecodeBfr[DecodeLength++] = c; // write the incoming byte.
  443. }
  444. unsigned char (FilterChainUrlDecode::*Internal)(); // Internal State Fn
  445. bool isHexDigit(unsigned char i); // Is i a hex digit?
  446. int convertHexDigit(unsigned char i); // Convert a single hex digit.
  447. unsigned char convertHexByte(unsigned char* x); // Convert a hex byte.
  448. // Here are the states of the UrlDecode module...
  449. unsigned char Bypass(); // Bypass - waiting for '<'
  450. unsigned char Tag(); // Looks for an 'a' or 'i' after '<'
  451. unsigned char Img1(); // Looks for 'm' in <img
  452. unsigned char Img2(); // Looks for 'g' in <img
  453. unsigned char Root(); // Root state of the decode FSM.
  454. unsigned char GetD1(); // Decoding step one.
  455. unsigned char GetD2(); // Decoding step two.
  456. unsigned char Inject(); // Injects the bfr into the stream.
  457. public:
  458. unsigned char GetByte() { // Overload the main fn().
  459. return (*this.*Internal)(); // Call the Internal function.
  460. }
  461. FilterChainUrlDecode(FilterChain* S) // Sourced constructor...
  462. :FilterChain(S), // Call the base constructor.
  463. Internal(&FilterChainUrlDecode::Bypass) { // Set ByPass mode.
  464. Clear(); // Clear the system.
  465. }
  466. FilterChainUrlDecode() { // Don't allow any
  467. throw BadSource("FilterChainUrlDecode: Source required"); // null constructors.
  468. }
  469. };
  470. // FilterChainHeaderAnalysis (and friends)
  471. // Performs header anomaly analysis and IP extraction and analysis.
  472. // IP Analysis is performed via a provided class that implements the IPTester
  473. // interface. An IP is provided to the IPTester as a [#.#.#.#] string. The
  474. // IPTester may respond with information to be emitted into the headers for
  475. // the pattern matching engine based on those results --- or not ;-)
  // Interface implemented by the IP tester that is handed to
  // FilterChainHeaderAnalysis.
  class FilterChainIPTester {

   public:

    // This is a polymorphic interface, so the destructor is virtual: an
    // implementation destroyed through a FilterChainIPTester pointer then
    // has its own destructor run.
    virtual ~FilterChainIPTester() {}

    // Tests the IP given in input and returns a reference to a string;
    // output is supplied by the caller to receive any result text.
    virtual std::string& test(std::string& input, std::string& output) = 0;
  };
  480. // The supplied test() function accepts the input string and returns the
  481. // output string. If desired, the output string can be modified to include
  482. // data from the tests that will be emitted into the data stream for the
  483. // pattern analysis engine to see. Otherwise, the output string should
  484. // remain blank. The test() function _should_ be thread safe -- that is why
  485. // we pass it both input and output ;-)
  486. //
  487. // The provided tester may have any side-effects that are desired.
  488. class FilterChainHeaderAnalysis : public FilterChain {
  489. private:
  490. unsigned char (FilterChainHeaderAnalysis::*Mode)(); // Internal State Fn Pointer (What Mode)
  491. FilterChainIPTester& IPTester; // This is the IP tester we use.
  492. string IPToTest; // String to capture IPs for testing.
  493. string IPTestResult; // String to receive IPtest results.
  494. // Header analysis output state...
  495. string EndOfHeaderResults; // String to capture EndOfHeaderResults.
  496. // OutputIndex and OutputLength are used to inject string data.
  497. // These are used to inject IPTestResult data and Header Analysis data.
  498. char* OutputBuffer; // Pointer to output injection string.
  499. int OutputIndex; // End of header output results index.
  500. void SetOutputBuffer(string& s); // Setup the OutputBuffer.
  501. unsigned char doInjectIPTestResult(); // Inject OutputBuffer and go to doSeekNL.
  502. unsigned char doInjectAnalysis(); // Inject OutputBuffer and go to doOff.
  503. // Header seek pattern state...
  504. // These tools work to follow patterns for header tags.
  505. // SetFollowPattern resets the engine and establishes the pattern to follow.
  506. // FollowPattern checks c against the next byte in the pattern.
  507. // -1 = The pattern failed.
  508. // 1 = The pattern was followed.
  509. // 0 = The pattern is complete.
  510. const char* MatchPattern; // Current pattern to match.
  511. int MatchIndex; // Pattern match following index.
  512. void SetFollowPattern(const char* p) { MatchPattern = p; MatchIndex = 0; } // Set the pattern to follow.
  513. int FollowPattern(char c); // Follow the pattern.
  514. //// Internal modes for this module...
  515. unsigned char doSeekNL(); // Looking for a new line.
  516. unsigned char doSeekDispatch(); // Looking at the first char after NL.
  517. unsigned char doReceived(); // Identifying a Received: header.
  518. unsigned char doFindIP(); // Seeking the [IP] in a Received header.
  519. unsigned char doTestIP(); // Gets and tests the [IP].
  520. unsigned char doFrom(); // Identifying a From: header.
  521. unsigned char doTo(); // Identifying a To: header.
  522. unsigned char doCC(); // Identifying a CC: header.
  523. unsigned char doMessageID(); // Identifying a MessageID header.
  524. unsigned char doDate(); // Identifying a Date: header.
  525. unsigned char doSubject(); // Identifying a Subject: header.
  526. unsigned char doEndOfHeaders(); // IdentifyEndOfHeaders & Emit Results.
  527. unsigned char doOff() { return FilterChain::GetByte(); } // Bypass mode.
  528. bool FoundFrom; // True if From: was found.
  529. bool FoundTo; // True if To: was found.
  530. bool FoundCC; // True if CC: was found.
  531. bool FoundMessageID; // True if Message-ID: was found.
  532. bool FoundDate; // True if Date: was found.
  533. bool FoundSubject; // True if Subject: was found.
  534. bool FoundHighBitCharacters; // True if high bit characters were found.
  535. unsigned char GetCheckedByte() { // Internal GetByte & check for high bits.
  536. unsigned char x = FilterChain::GetByte(); // Get the byte from up the chain.
  537. if(0 < (x & 0x80)) { // Check for a high bit byte (non-ascii).
  538. FoundHighBitCharacters = true; // If it is found then set the flag.
  539. } // If not then at least we checked ;-)
  540. return x; // Return the byte.
  541. }
  542. public:
  543. unsigned char GetByte() { // Overload the main fn().
  544. return (*this.*Mode)(); // Call the Internal function for this mode.
  545. }
  546. FilterChainHeaderAnalysis(FilterChain* S, FilterChainIPTester& T) : // Construct with the chain and a tester.
  547. FilterChain(S), // Capture the chain.
  548. Mode(&FilterChainHeaderAnalysis::doSeekDispatch), // Start in SeekDispatch() mode
  549. IPTester(T), // Capture the tester.
  550. IPToTest(""), // IPToTest and
  551. IPTestResult(""), // IPTestResult are both empty to start.
  552. FoundFrom(false), // Set all of the "found" bits to false.
  553. FoundTo(false),
  554. FoundCC(false),
  555. FoundMessageID(false),
  556. FoundDate(false),
  557. FoundSubject(false),
  558. FoundHighBitCharacters(false) {
  559. } // -- first byte of a new line ;-)
  560. bool MissingFrom() { return (!FoundFrom); } // True if missing From header.
  561. bool MissingTo() { return (!FoundTo); } // True if missing To header.
  562. bool MissingCC() { return (!FoundCC); } // True if missing CC header.
  563. bool MissingSubject() { return (!FoundSubject); } // True if missing Subject header.
  564. bool MissingDate() { return (!FoundDate); } // True if missing Date header.
  565. bool MissingMessageID() { return (!FoundDate); } // True if missing MessageID header.
  566. bool HighBitCharacters() { return (FoundHighBitCharacters); } // True if High bit characters were found.
  567. };
  568. #endif