Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760
  1. // FilterChain.hpp
  2. //
  3. // (C) 2002-2020 MicroNeil Research Corporation
  4. //
  5. // This is the base class header for FilterChain objects.
  6. // FilterChain objects can be chained together to filter
  7. // a byte stream. Each object produces a single character
  8. // per call. It will also call its source object for the
  9. // next character as required.
  10. // History...
  11. // 20060822 _M
  12. // Adding FilterChainHeaderAnalysis to identify missing headers and header
  13. // anomalies, and to extract and test IP data.
  14. // 20060127 _M
  15. // Added FilterChainCBFG to accept a buffer of a specific
  16. // length.
  17. // 20041116 _M Added UrlDecode module. The module will repeat a decoded version of
  18. // any anchor tag that it sees which contains decodable %xx bytes. Other anchor
  19. // tags are not repeated.
  20. // 20041116 _M Upgrades to the Defunker module. The module now decodes any HTML
  21. // encoded bytes that could have been normal ascii.
  22. // 20041114 _M Completed basic defunker engine which strips out all HTML and some
  23. // basic   encoding.
  24. // 20041113 _M Began heavy upgrades to this module to improve performance and
  25. // provide additional obfuscation removal. This modification will include a move
  26. // from the use of switch(State) mechanisms to the use of function pointers. This
  27. // should save a few cycles on every byte processed.
  28. // 20021025 _M
  29. // Added FilterChainCString to accept a Null Terminated
  30. // String (CString). Except for the input form it operates
  31. // exactly like the FilterChainInput form as modified below.
  32. // This allows WebClay to deliver the message using a buffer
  33. // rather than a file.
  34. // 20021015 _M
  35. // Modified FilterChainInput to eat control characters and
  36. // <CR> bytes so that the input stream "appears" always to
  37. // be terminated in the *nix standard \n. Tabs are also passed
  38. // but all other low bytes are eaten.
  39. // 20020721 _M File Created.
  40. // This is the base class - nothing special happens here
  41. // except defining the basic format of a FilterChain object.
  42. // If this object is instantiated, then it will simply return
  43. // its source's data, or a stream of 0 bytes if none has been
  44. // defined.
  45. #pragma once
  46. #include <stdexcept>
  47. #include <iostream>
  48. #include <sstream>
  49. #include <string>
  50. #include <cstring>
  51. #include <cstdlib>
  52. #include <cctype>
  // Module-wide parameters.
  // ScanBufferSize is the number of bytes in a filter's scan buffer.

  static const int ScanBufferSize = 128;
  // FilterChain
  // Base class for every link in a filter chain. Each GetByte() call yields
  // one byte: the next byte of the upstream Source when there is one, or 0
  // when the link was built without a source.

  class FilterChain {

    private:

      FilterChain* Source;                                  // Upstream link, NULL if none.

    public:

      // Raised when a link is constructed with an unusable source.
      class BadSource : public std::invalid_argument {
        public:
          BadSource(const std::string& w) : invalid_argument(w) {}
      };

      // Underflow style exception type made available to all links.
      class Empty : public std::underflow_error {
        public:
          Empty(const std::string& w) : underflow_error(w) {}
      };

      // Hand back the upstream byte, or 0 when nothing is upstream.
      virtual unsigned char GetByte() {
          return (NULL != Source) ? Source->GetByte() : 0;
      }

      // Unsourced link.
      FilterChain() : Source(NULL) {}

      // Sourced link. Throws BadSource when S is NULL.
      FilterChain(FilterChain* S) {
          if(NULL == S) {
              throw BadSource("FilterChain: NULL source not valid");
          }
          Source = S;
      }

      // Virtual so that derived links are destroyed properly through a
      // FilterChain pointer.
      virtual ~FilterChain() {}
  };
  78. // FilterChainInput
  79. // This version of FilterChain accepts an istream as a source and
  80. // gets a single character from it at each GetByte();
  81. class FilterChainInput : public FilterChain {
  82. private:
  83. std::istream* SourceIstream;
  84. public:
  85. // Here we overload the GetByte() function to get a byte
  86. // from the source stream. This is a litle bit special because
  87. // we're going to start our filtering process. Since we are
  88. // filtering text streams for pattern matching systems we will
  89. // eat any special control characters we get - including <CR>.
  90. // This helps us standardize on a *nix model for line ends as
  91. // each line end will be \n. It also gets rid of a lot of junk.
  92. unsigned char GetByte() { // Get the next byte.
  93. char i; // Keep it here.
  94. do{ // Loop to eat junk.
  95. SourceIstream->get(i); // Read the next byte...
  96. if(!SourceIstream->good()) // If something went wrong then
  97. throw Empty("FilterChain: No more data"); // throw the empty exception.
  98. if(i >= ' ') break; // Send all good bytes right away.
  99. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  100. // Otherwise quietly eat anything
  101. } while(true); // less than a space.
  102. return i; // Return the latest byte...
  103. }
  104. // Here we overload the constructor to accept a stream.
  105. FilterChainInput(std::istream* S){ // Build me with a stream.
  106. if(NULL==S) throw BadSource("FilterChainInput: Null source not valid" ); // If it's NULL that's bad.
  107. if(!S->good()) throw BadSource("FilterChainInput: Bad istream"); // Not good is bad.
  108. else SourceIstream = S; // If it's good we keep it.
  109. }
  110. FilterChainInput() { // If we don't have a source then
  111. throw BadSource("FilterChainInput: Source required"); // we're no good.
  112. }
  113. };
  114. // FilterChainCString
  115. // This version sources the data for the chain from a message buffer, or
  116. // more precisely a null terminated string. The basic operation is identical
  117. // to that of FilterChainInput above except that we're not working with
  118. // a filestream as an input.
  119. class FilterChainCString : public FilterChain {
  120. private:
  121. unsigned char* InputBuffer;
  122. int BufferIndex;
  123. public:
  124. // Here we overload GetByte() just like we do in FilterChainInput
  125. // except that we're going to get our data from a NULL terminated
  126. // string instead of a stream. IN FACT ... the code below was simply
  127. // copied from FilterChainInput and modified in place.
  128. unsigned char GetByte() { // Get the next byte.
  129. unsigned char i; // Keep it here.
  130. do{ // Loop to eat junk.
  131. i = InputBuffer[BufferIndex++]; // Read the next byte...
  132. if(0 == i) // If there's nothing left then
  133. throw Empty("FilterChainCString: No more data"); // throw the empty exception.
  134. if(i >= ' ') break; // Send all good bytes right away.
  135. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  136. // Otherwise quietly eat anything
  137. } while(true); // less than a space.
  138. return i; // Return the latest byte...
  139. }
  140. // Here we overload the constructor to accept a stream.
  141. FilterChainCString(unsigned char* S){ // Build me with a char buffer.
  142. if(NULL==S) throw BadSource("FilterChainCString: NULL source not valid"); // If it's NULL that's bad.
  143. if(0==S[0]) throw BadSource("FilterChainCString: Empty source not valid"); // Empty is bad.
  144. else InputBuffer = S; // If it's good we keep it.
  145. BufferIndex = 0; // Always start at index 0.
  146. }
  147. FilterChainCString() { // If we don't have a source then
  148. throw BadSource("FilterChainCString: Source required"); // we're no good.
  149. }
  150. };
  151. // FilterChainCBFR
  152. // This version sources the data for the chain from a message buffer, NOT
  153. // a null terminated string. The basic operation is identical to FilterChainCString
  154. // except that this version requires the length of the buffer and stops when that
  155. // number of characters have been read.
  156. class FilterChainCBFR : public FilterChain {
  157. private:
  158. unsigned char* InputBuffer;
  159. unsigned int BufferLength;
  160. unsigned int BufferIndex;
  161. std::stringstream& PrependedHeaders;
  162. bool PrependNotBuffer;
  163. public:
  164. // Here we overload GetByte() just like we do in FilterChainInput
  165. // except that we're going to get our data from a known length char
  166. // buffer instead of a stream. IN FACT ... the code below was simply
  167. // copied from FilterChainCString and modified in place.
  168. unsigned char GetByte() { // Get the next byte.
  169. unsigned char i; // Keep it here.
  170. if(PrependNotBuffer) { // While in prepend mode:
  171. if(BufferIndex < PrependedHeaders.str().length()) { // If there is more to get
  172. i = PrependedHeaders.str().at(BufferIndex); // then get it and move
  173. ++BufferIndex; // the index.
  174. } else { // As soon as we run out
  175. PrependNotBuffer = false; // of prepended headers switch
  176. BufferIndex = 0; // to the CBFR and reset the index.
  177. return GetByte(); // Recurse to get the next byte.
  178. }
  179. } else { // While in buffer mode:
  180. do{ // Loop to eat junk.
  181. if(BufferLength <= BufferIndex) // If there's nothing left then
  182. throw Empty("FilterChainCBFR: No more data"); // throw the empty exception.
  183. i = InputBuffer[BufferIndex++]; // Read the next byte...
  184. if(i >= ' ') break; // Send all good bytes right away.
  185. if(i=='\n' || i=='\t') break; // If we hit a \n or \t send it.
  186. // Otherwise quietly eat anything
  187. } while(true); // less than a space.
  188. }
  189. return i; // Return the latest byte...
  190. }
  191. // Here we overload the constructor to accept a stream.
  192. FilterChainCBFR(unsigned char* S, int l, std::stringstream& P) : // Give me a bfr and a stringstream.
  193. InputBuffer(S), // Grab the buffer,
  194. BufferLength(l), // Grab the buffer length,
  195. BufferIndex(0), // Initialize the index to 0,
  196. PrependedHeaders(P), // Grab the PrependedHeaders reference.
  197. PrependNotBuffer(true) { // Do PrependedHeaders first.
  198. if(NULL==S) throw BadSource("FilterChainCBFR: NULL source not valid"); // If it's NULL that's bad.
  199. if(0==l && 0==P.str().length())
  200. throw BadSource("FilterChainCBFR: Empty source not valid"); // Empty is bad.
  201. }
  202. };
  203. // FilterChainBase64
  204. // This version decodes base64 content in email messages. It begins
  205. // to decode this as soon as it sees the following message and two
  206. // blank lines indicating the coding has started.
  207. //
  208. // Content-Transfer-Encoding: base64
  209. //
  210. // Once it sees a bad character or what appears to be the start of
  211. // a new MIME segment, the filter turns off and passes through its
  212. // source data.
  // Text that switches the base64 filter on. The leading <LF> is part of the
  // string so that a match can only begin at the start of a line.

  static const unsigned char Base64Start[] = "\nContent-Transfer-Encoding: base64";

  // XX64 marks every Base64Table entry that does not belong to a base64 symbol.

  static const unsigned char XX64 = 0xFF;

  // Base64Table is indexed by the incoming byte and yields either the six bit
  // value of that base64 symbol or XX64. The pad symbol '=' is a special case
  // and yields 0x00. Each row below covers sixteen byte values; the comment at
  // the start of a row is the high nibble of the bytes in that row.

  static const unsigned char Base64Table[256] = {
      /* 0x */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* 1x */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* 2x */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,0x3E,XX64,XX64,XX64,0x3F,
      /* 3x */ 0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,XX64,XX64,XX64,0x00,XX64,XX64,
      /* 4x */ XX64,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,
      /* 5x */ 0x0F,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,XX64,XX64,XX64,XX64,XX64,
      /* 6x */ XX64,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
      /* 7x */ 0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,0x30,0x31,0x32,0x33,XX64,XX64,XX64,XX64,XX64,
      /* 8x */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* 9x */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* Ax */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* Bx */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* Cx */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* Dx */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* Ex */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,
      /* Fx */ XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64,XX64
  };

  // Bit positions of the four six bit segments when they are combined to
  // produce three octets.

  static const unsigned char base64_seg0_shift = 18;
  static const unsigned char base64_seg1_shift = 12;
  static const unsigned char base64_seg2_shift = 6;
  static const unsigned char base64_seg3_shift = 0;
  248. class FilterChainBase64 : public FilterChain {
  249. private:
  250. unsigned char x,y; // We need a few holding bins.
  251. unsigned int Workspace; // Numerical workspace for conversion.
  252. enum FilterState { // Operating State Codes.
  253. SCANNING, // One-in = One-out, looking for startup.
  254. DEQUEING, // Delivering buffered data.
  255. DECODING // Delivering filtered data.
  256. } State;
  257. unsigned int ScanIx; // Scanning Index.
  258. unsigned int DequeIx; // Dequeing Index.
  259. unsigned char Buffer; // Define a buffer.
  260. bool ValidBuffer; // Set if Buffer has data.
  261. bool ValidByte(unsigned char y); // True if y can be decoded.
  262. public:
  263. unsigned char GetByte(); // Overload the main fn().
  264. FilterChainBase64(FilterChain* S) // Sourced constructor...
  265. :FilterChain(S){ // Call the base constructor.
  266. State = SCANNING; // Set filter inactive.
  267. ScanIx=DequeIx=0; // Reset our indexes.
  268. } // We're all ready to start.
  269. FilterChainBase64() { // Don't allow any
  270. throw BadSource("FilterChainBase64: Source required"); // null constructors.
  271. }
  272. };
  273. // FilterChainQuotedPrintable
  274. // This version decodes quoted-printable content in email messages.
  275. //
  276. // For simplicity this one is always on. That is, whenever it sees a
  277. // convertible quoted printable byte it will exchange it for the byte
  278. // that is represented. This is only intended for operation preceding the
  279. // spam filter engine so it is safe to make these conversions.
  280. class FilterChainQuotedPrintable : public FilterChain {
  281. private:
  282. long int Workspace; // Plain Text Workspace.
  283. enum FilterState { // Operating State Codes
  284. SCANNING, // One-in = One-out - looking for start-up.
  285. DEQUEING, // Delivering buffered data.
  286. DECODING // Delivering filtered data.
  287. } State;
  288. int BufferLength; // How full is the buffer.
  289. int BufferIndex; // What byte are we on?
  290. unsigned char Buffer[ScanBufferSize]; // Define the buffer.
  291. bool isHexDigit(unsigned char i); // true if i is a hex digit byte.
  292. int convertHexDigit(unsigned char i); // returns integer value of hex digit i.
  293. public:
  294. unsigned char GetByte(); // Overload the main fn().
  295. FilterChainQuotedPrintable(FilterChain* S) // Sourced constructor...
  296. :FilterChain(S){ // Call the base constructor.
  297. State = SCANNING; // Set to the initial state.
  298. BufferIndex = 0; // Initial buffer index.
  299. BufferLength = 0; // Initial buffer length.
  300. Workspace = 0; // Clear the workspace.
  301. }
  302. FilterChainQuotedPrintable() { // Don't allow any
  303. throw BadSource("FilterChainQuotedPrintable: Source required"); // null constructors.
  304. }
  305. };
  306. // FilterChainDefunker
  307. // This module stores a copy of the stream containing HTML and then emits it
  308. // at the end of the stream with all of the html elements removed and/or decoded
  309. // to eliminate html based obfuscation.
  310. class FilterChainDefunker;
  311. static const int DefunkerSize = 32768; // Store size.
  312. static const int DefunkerQueueSize = 24; // Size of defunker queue.
  313. class FilterChainDefunker : public FilterChain { // Class definition.
  314. private:
  315. // Occasionally when parsing a chunk of data we must return nothing and
  316. // instead try again for the next character. Instead of resursing we can
  317. // set this flag and the root state will simply try again in a loop.
  318. bool ReturnNothing; // Set true to skip this round;
  319. // Storeage
  320. unsigned char StoreBuffer[DefunkerSize];
  321. int InputPosition;
  322. int OutputPosition;
  323. // Nodes in the state change model are represented by functions.
  324. // These modes represent the state prior to getting the Empty exception.
  325. // During this mode, the Defunker simply stores a portion of the message
  326. // to be scanned later.
  327. unsigned char LastRawByte; // Last Raw Byte (for SkipHeaders);
  328. unsigned char SkipHeaders(); // Skips the headers before Store();
  329. unsigned char Store(); // Stores the message content for later.
  330. // Here is a handy Queue mechanism for recovering failed patterns.
  331. int QueueLength; // Queue Length (write position).
  332. int QueuePosition; // Queue Read Position.
  333. unsigned char Qbfr[DefunkerQueueSize]; // Queue Buffer.
  334. void ClearQueue() { // Clear the queue.
  335. memset(Qbfr,0,sizeof(Qbfr)); // Reset the buffer.
  336. QueueLength = 0; // Zero the length.
  337. QueuePosition = 0; // Zero the position.
  338. }
  339. unsigned char DeQueue() { // Empty the queue then back to DefunkRoot.
  340. if(QueuePosition >= QueueLength) { // If the queue is empty then
  341. ClearQueue(); // clear the queue,
  342. Internal = &FilterChainDefunker::DefunkRoot; // go back to DefunkRoot mode,
  343. return GetInternal(); // and return the next byte.
  344. } // If the queue is not empty then
  345. return Qbfr[QueuePosition++]; // return the next byte from the queue.
  346. }
  347. void EnQueue(unsigned char x) { // Add a byte to the queue.
  348. if(QueueLength<DefunkerQueueSize) // If we are safely within the buffer
  349. Qbfr[QueueLength++] = x; // then add this byte to the queue.
  350. }
  351. // These modes represent the Defunker pulling data out of it's
  352. // stored copy so that it can be filtered and delivered to the scanner.
  353. // These modes get turned on once the Empty exception is read from
  354. // the underlying source.
  355. unsigned char Preamble(); // Preamble - separates Defunked text.
  356. unsigned char DefunkRoot(); // Root in Defunk mode.
  357. unsigned char OpenTag(); // Open tag detected.
  358. unsigned char OpenAmp(); // Open & tag.
  359. unsigned char MatchBR(); // Matching <br>
  360. unsigned char MatchP(); // Matching <p>
  361. unsigned char MatchNBSP(); // Matching &nbps;
  362. unsigned char SwitchAMPAPOS(); // Looking for AMP or APOS.
  363. unsigned char MatchAMP(); // Matching &amp;
  364. unsigned char MatchAPOS(); // Matching &apos;
  365. unsigned char MatchLT(); // Matching &lt;
  366. unsigned char MatchGT(); // Matching &gt;
  367. unsigned char MatchQUOT(); // Matching &quot;
  368. unsigned char EatTag(); // Eating an unknown tag.
  369. unsigned char DecodeNum(); // Decoding &#...number...;
  370. // Part of defunking is to convert all runs of whitespace into a single space.
  371. // It also doubles as the master output function once we're out of Store() mode.
  372. unsigned char SpaceConvChart[256]; // Space conversion chart.
  373. unsigned char LastReadOut; // Last ReadOut byte (for deduping spaces).
  374. unsigned char ReadOut(); // Read out the store through the filter.
  375. unsigned char LastGetStore; // Last GetStore byte (for EatTag).
  376. unsigned char GetStore(); // Read a byte from the store.
  377. // Here is a handy pattern match function for eliminating some tags.
  378. bool MatchTagPattern(const char* pattern) { // Matches pattern. True if matched.
  379. int pos = 2; // Now on the third byte (index 2).
  380. while(pattern[pos]){ // While we have more bytes to match
  381. unsigned char x = GetStore(); // grab the next byte.
  382. // Special case - HTML tag with a space as in <p stuff>
  383. if(x==' ' && pattern[pos]=='>') { // If we have a tag with parameters.
  384. pos++; // Move pos forward to it's null.
  385. while(GetStore()!='>')continue; // Eat up to the > and then
  386. break; // we are done.
  387. }
  388. // In the normal case follow the pattern.
  389. if(tolower(x)!=pattern[pos]) break; // If we fell off then stop.
  390. pos++; // If we didn't break move ahead.
  391. }
  392. // At this point we are either at the null in our pattern or we did not match.
  393. if(pattern[pos]) { return false; } // If we're not at the end then no match.
  394. return true; // Otherwise we do have a match :-)
  395. }
  396. // These are the function pointers that map the current state of this object.
  397. unsigned char (FilterChainDefunker::*Master)(); // Master function for GetByte()
  398. unsigned char (FilterChainDefunker::*Internal)(); // Internal function for GetByte()
  399. public:
  400. unsigned char GetByte() { // Overload the main fn().
  401. return (*this.*Master)(); // Call the master function.
  402. }
  403. unsigned char GetInternal() { // Internal state machine get.
  404. return (*this.*Internal)(); // Call the internal function.
  405. }
  406. FilterChainDefunker(FilterChain* S) // Sourced constructor...
  407. :FilterChain(S), // Call the base constructor.
  408. InputPosition(0), // Reset both position pointers.
  409. OutputPosition(0),
  410. LastRawByte(0),
  411. LastReadOut(0),
  412. LastGetStore(0),
  413. Master(&FilterChainDefunker::SkipHeaders), // Set the initial external and
  414. Internal(&FilterChainDefunker::Preamble) { // internal states.
  415. ClearQueue(); // Clear the queue;
  416. memset(StoreBuffer,0,sizeof(StoreBuffer)); // Clear the store buffer.
  417. for(int i=0;i<256;i++) SpaceConvChart[i]=i; // Initialize the chart.
  418. SpaceConvChart[(int)'\r']=' '; // Convert <CR> to space.
  419. SpaceConvChart[(int)'\n']=' '; // Convert <LF> to space.
  420. SpaceConvChart[(int)'\t']=' '; // Convert Tab to space.
  421. }
  422. FilterChainDefunker() { // Don't allow any
  423. throw BadSource("FilterChainDefunker: Source required"); // null constructors.
  424. }
  425. };
  426. // FilterChainUrlDecode
  427. // This module removes any unnecessary URL encoding within an <a...> tag. The
  428. // cleaned up version (if different) is emitted immediately after the original
  429. // <a...> tag so that both versions can be interpreted by the pattern scanner.
  430. // This is designed to eliminate common obfuscation techniques.
  431. const int UrlDecodeBfrSize = 256; // Decode Buffer Size.
  432. class FilterChainUrlDecode : public FilterChain {
  433. private:
  434. unsigned char DecodeBfr[UrlDecodeBfrSize]; // Decoded anchor buffer.
  435. unsigned int DecodeLength; // Decoded anchor length.
  436. unsigned int DecodePosition; // Read (Inject) Position.
  437. bool DecodeFlag; // True if the URL was decoded.
  438. void Clear() { // Function to clear the bfr.
  439. memset(DecodeBfr,0,sizeof(DecodeBfr)); // Null it out and set
  440. DecodeLength = 0; // the length to zero.
  441. DecodePosition = 0; // Reset the Read position.
  442. DecodeFlag = false; // Reset the Decode Flag.
  443. }
  444. void AddToBfr(unsigned char c) { // Safely add to our buffer.
  445. if(DecodeLength < sizeof(DecodeBfr)-1) // If we have more room then
  446. DecodeBfr[DecodeLength++] = c; // write the incoming byte.
  447. }
  448. unsigned char (FilterChainUrlDecode::*Internal)(); // Internal State Fn
  449. bool isHexDigit(unsigned char i); // Is i a hex digit?
  450. int convertHexDigit(unsigned char i); // Convert a single hex digit.
  451. unsigned char convertHexByte(unsigned char* x); // Convert a hex byte.
  452. // Here are the states of the UrlDecode module...
  453. unsigned char Bypass(); // Bypass - waiting for '<'
  454. unsigned char Tag(); // Looks for an 'a' or 'i' after '<'
  455. unsigned char Img1(); // Looks for 'm' in <img
  456. unsigned char Img2(); // Looks for 'g' in <img
  457. unsigned char Root(); // Root state of the decode FSM.
  458. unsigned char GetD1(); // Decoding step one.
  459. unsigned char GetD2(); // Decoding step two.
  460. unsigned char Inject(); // Injects the bfr into the stream.
  461. public:
  462. unsigned char GetByte() { // Overload the main fn().
  463. return (*this.*Internal)(); // Call the Internal function.
  464. }
  465. FilterChainUrlDecode(FilterChain* S) // Sourced constructor...
  466. :FilterChain(S), // Call the base constructor.
  467. Internal(&FilterChainUrlDecode::Bypass) { // Set ByPass mode.
  468. Clear(); // Clear the system.
  469. }
  470. FilterChainUrlDecode() { // Don't allow any
  471. throw BadSource("FilterChainUrlDecode: Source required"); // null constructors.
  472. }
  473. };
  474. // FilterChainHeaderAnalysis (and friends)
  475. // Performs header anomaly analysis and IP extraction and analysis.
  476. // IP Analysis is performed via a provided class that implements the IPTester
  477. // interface. An IP is provided to the IPTester as a [#.#.#.#] string. The
  478. // IPTester may respond with information to be emitted into the headers for
  479. // the pattern matching engine based on those results --- or not ;-)
  class FilterChainIPTester {
    public:
      // Tests the IP given in input. Results, if any, are written to output
      // and a reference to a string is returned.
      virtual std::string& test(std::string& input, std::string& output) = 0;

      // Virtual so that an implementation can be destroyed safely through a
      // FilterChainIPTester pointer.
      virtual ~FilterChainIPTester() {}
  };
  484. // The supplied test() function accepts the input string and returns the
  485. // output string. If desired, the output string can be modified to include
  486. // data from the tests that will be emitted into the data stream for the
  487. // pattern analysis engine to see. Otherwise, the output string should
  488. // remain blank. The test() function _should_ be thread safe -- that is why
  489. // we pass it both input and output ;-)
  490. //
  491. // The provided tester may have any side-effects that are desired.
  492. class FilterChainHeaderAnalysis : public FilterChain {
  493. private:
  494. unsigned char (FilterChainHeaderAnalysis::*Mode)(); // Internal State Fn Pointer (What Mode)
  495. FilterChainIPTester& IPTester; // This is the IP tester we use.
  496. std::string IPToTest; // String to capture IPs for testing.
  497. std::string IPTestResult; // String to receive IPtest results.
  498. // Header analysis output state...
  499. std::string EndOfHeaderResults; // String to capture EndOfHeaderResults.
  500. // OutputIndex and OutputLength are used to inject string data.
  501. // These are used to inject IPTestResult data and Header Analysis data.
  502. char* OutputBuffer; // Pointer to output injection string.
  503. int OutputIndex; // End of header output results index.
  504. void SetOutputBuffer(std::string& s); // Setup the OutputBuffer.
  505. unsigned char doInjectIPTestResult(); // Inject OutputBuffer and go to doSeekNL.
  506. unsigned char doInjectAnalysis(); // Inject OutputBuffer and go to doOff.
  507. // Header seek pattern state...
  508. // These tools work to follow patterns for header tags.
  509. // SetFollowPattern resets the engine and establishes the pattern to follow.
  510. // FollowPattern checks c against the next byte in the pattern.
  511. // -1 = The pattern failed.
  512. // 1 = The pattern was followed.
  513. // 0 = The pattern is complete.
  514. const char* MatchPattern; // Current pattern to match.
  515. int MatchIndex; // Pattern match following index.
  516. void SetFollowPattern(const char* p) { MatchPattern = p; MatchIndex = 0; } // Set the pattern to follow.
  517. int FollowPattern(char c); // Follow the pattern.
  518. //// Internal modes for this module...
  519. unsigned char doSeekNL(); // Looking for a new line.
  520. unsigned char doSeekDispatch(); // Looking at the first char after NL.
  521. unsigned char doReceived(); // Identifying a Received: header.
  522. unsigned char doFindIP(); // Seeking the [IP] in a Received header.
  523. unsigned char doTestIP(); // Gets and tests the [IP].
  524. unsigned char doFrom(); // Identifying a From: header.
  525. unsigned char doTo(); // Identifying a To: header.
  526. unsigned char doCC(); // Identifying a CC: header.
  527. unsigned char doMessageID(); // Identifying a MessageID header.
  528. unsigned char doDate(); // Identifying a Date: header.
  529. unsigned char doSubject(); // Identifying a Subject: header.
  530. unsigned char doEndOfHeaders(); // IdentifyEndOfHeaders & Emit Results.
  531. unsigned char doOff() { return FilterChain::GetByte(); } // Bypass mode.
  532. bool FoundFrom; // True if From: was found.
  533. bool FoundTo; // True if To: was found.
  534. bool FoundCC; // True if CC: was found.
  535. bool FoundMessageID; // True if Message-ID: was found.
  536. bool FoundDate; // True if Date: was found.
  537. bool FoundSubject; // True if Subject: was found.
  538. bool FoundHighBitCharacters; // True if high bit characters were found.
  539. unsigned char GetCheckedByte() { // Internal GetByte & check for high bits.
  540. unsigned char x = FilterChain::GetByte(); // Get the byte from up the chain.
  541. if(0 < (x & 0x80)) { // Check for a high bit byte (non-ascii).
  542. FoundHighBitCharacters = true; // If it is found then set the flag.
  543. } // If not then at least we checked ;-)
  544. return x; // Return the byte.
  545. }
  546. public:
  547. unsigned char GetByte() { // Overload the main fn().
  548. return (*this.*Mode)(); // Call the Internal function for this mode.
  549. }
  550. FilterChainHeaderAnalysis(FilterChain* S, FilterChainIPTester& T) : // Construct with the chain and a tester.
  551. FilterChain(S), // Capture the chain.
  552. Mode(&FilterChainHeaderAnalysis::doSeekDispatch), // Start in SeekDispatch() mode
  553. IPTester(T), // Capture the tester.
  554. IPToTest(""), // IPToTest and
  555. IPTestResult(""), // IPTestResult are both empty to start.
  556. FoundFrom(false), // Set all of the "found" bits to false.
  557. FoundTo(false),
  558. FoundCC(false),
  559. FoundMessageID(false),
  560. FoundDate(false),
  561. FoundSubject(false),
  562. FoundHighBitCharacters(false) {
  563. } // -- first byte of a new line ;-)
  564. bool MissingFrom() { return (!FoundFrom); } // True if missing From header.
  565. bool MissingTo() { return (!FoundTo); } // True if missing To header.
  566. bool MissingCC() { return (!FoundCC); } // True if missing CC header.
  567. bool MissingSubject() { return (!FoundSubject); } // True if missing Subject header.
  568. bool MissingDate() { return (!FoundDate); } // True if missing Date header.
  569. bool MissingMessageID() { return (!FoundDate); } // True if missing MessageID header.
  570. bool HighBitCharacters() { return (FoundHighBitCharacters); } // True if High bit characters were found.
  571. };