You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

roller.hpp 3.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. // roller.hpp
  2. // Copyright (C) 2022 MicroNeil Research Corporation.
  3. //
  4. // This software is released under the MIT license. See LICENSE.TXT.
  5. //
  6. // Roller is a naive rolling hash system for rapid string searches.
  7. // Roller32 accommodates matches up to 4 bytes.
  8. // Roller64 accommodates matches up to 8 bytes.
  9. #pragma once
  10. #include <cstdint>
  11. #include <ctype.h>
  12. #include <string>
  13. #include <vector>
  14. #include <iostream>
  15. #include <bitset>
  16. namespace codedweller {
  17. class Roller32 {
  18. private:
  19. uint_fast32_t roller;
  20. public:
  21. Roller32() : roller(0) {}
  22. uint_fast32_t value() const { return roller; }
  23. uint_fast32_t add(uint8_t byte) {
  24. roller = roller << 8 | byte;
  25. return roller;
  26. }
  27. };
  28. class Roller64 {
  29. private:
  30. uint_fast64_t roller;
  31. public:
  32. Roller64() : roller(0) {}
  33. uint_fast64_t value() const { return roller; }
  34. uint_fast64_t add(uint8_t byte) {
  35. roller = roller << 8 | byte;
  36. return roller;
  37. }
  38. };
  39. namespace roller{
  40. static const unsigned char caseBit = 'A' ^ 'a';
  41. static const unsigned char caseMask = ~caseBit;
  42. unsigned char applyCaseSensitivity(unsigned char byte, bool caseSensitive) {
  43. if (true==caseSensitive) return byte;
  44. if (false==isalpha(byte) && 0xFF != byte) return byte;
  45. unsigned char insensitiveByte = byte & caseMask;
  46. return insensitiveByte;
  47. }
  48. unsigned char appropriateBitMask(unsigned char byte, bool caseSensitive) {
  49. if (true==caseSensitive) return 0xff;
  50. if (false==isalpha(byte)) return 0xff;
  51. return caseMask;
  52. }
  53. template <class D, class T>
  54. void compute(D data, bool caseSensitive, T& matcher, T& masker) {
  55. size_t ingest = std::min(sizeof(matcher.value()), data.size());
  56. for(size_t count = 0; count < ingest; count++) {
  57. unsigned char theByte = data.at(count);
  58. matcher.add(roller::applyCaseSensitivity(theByte, caseSensitive));
  59. masker.add(roller::appropriateBitMask(theByte, caseSensitive));
  60. }
  61. }
  62. }
  63. class RollerMatch32 {
  64. private:
  65. uint_fast32_t match;
  66. uint_fast32_t mask;
  67. public:
  68. RollerMatch32(const std::vector<unsigned char> pattern, bool caseSensitive=true) {
  69. Roller32 matcher, masker;
  70. roller::compute(pattern, caseSensitive, matcher, masker);
  71. match = matcher.value();
  72. mask = masker.value();
  73. }
  74. RollerMatch32(const std::string pattern, bool caseSensitive=true) {
  75. Roller32 matcher, masker;
  76. roller::compute(pattern, caseSensitive, matcher, masker);
  77. match = matcher.value();
  78. mask = masker.value();
  79. }
  80. bool matches(const Roller32 roller) {
  81. return (match == (roller.value() & mask));
  82. }
  83. };
  84. class RollerMatch64 {
  85. private:
  86. uint_fast64_t match;
  87. uint_fast64_t mask;
  88. public:
  89. RollerMatch64(const std::vector<unsigned char> pattern, bool caseSensitive=true) {
  90. Roller64 matcher, masker;
  91. roller::compute(pattern, caseSensitive, matcher, masker);
  92. match = matcher.value();
  93. mask = masker.value();
  94. }
  95. RollerMatch64(const std::string pattern, bool caseSensitive=true) {
  96. Roller64 matcher, masker;
  97. roller::compute(pattern, caseSensitive, matcher, masker);
  98. match = matcher.value();
  99. mask = masker.value();
  100. }
  101. bool matches(const Roller64 roller) {
  102. return (match == (roller.value() & mask));
  103. }
  104. };
  105. }