Merge pull request #773 from jettero/fix-py3-hec-dq

Tenebriso · web-flow · commit 1f455dc7e676 · 2020-01-06T13:58:38.000+02:00
Fix py3 hec dq
diff --git a/.pipeline b/.pipeline
@@ -1,5 +1,5 @@
 
-def imgname = 'hubblestack/jenkins:centos-v1.0.8'
+def imgname = 'hubblestack/jenkins:centos-v1.0.9'
 
 pipeline {
     agent { docker { image "${imgname}" } }
@@ -13,6 +13,7 @@ pipeline {
     environment {
         PY_COLORS = 1
         HS_PROFILE = 1
+        PY_V = '3.6.10'
     }
 
     stages {
diff --git a/hubblestack/hec/dq.py b/hubblestack/hec/dq.py
@@ -7,6 +7,8 @@
 import shutil
 import json
 from collections import deque
+from hubblestack.utils.misc import numbered_file_split_key
+from hubblestack.utils.encoding import encode_something_to_bytes, decode_something_to_string
 
 __all__ = [
     'QueueTypeError', 'QueueCapacityError', 'MemQueue', 'DiskQueue',
@@ -37,6 +39,7 @@ def check_type(self, item):
         if not isinstance(item, self.ok_types):
             raise QueueTypeError('type({0}) is not ({1})'.format(type(item), self.ok_types))
 
+
 class NoQueue(object):
     cn = 0
     def put(self, *a, **kw):
@@ -69,6 +72,7 @@ def __bool__(self):
     __nonzero__ = __bool__ # stupid python2
 
     def compress(self, dat):
+        dat = encode_something_to_bytes(dat)
         if not self.compression:
             return dat
         def _bz2(x):
@@ -84,7 +88,8 @@ def unlink_(self, fname):
                 os.unlink(name)
 
     def decompress(self, dat):
-        if str(dat).startswith('BZ'):
+        dat = encode_something_to_bytes(dat)
+        if dat.startswith(b'BZ'):
             try:
                 return bz2.BZ2Decompressor().decompress(dat)
             except IOError:
@@ -131,8 +136,6 @@ def put(self, item, **meta):
         f = os.path.join(d, remainder)
         with open(f, 'wb') as fh:
             log.debug('writing item to disk cache')
-            if isinstance(bstr, str):
-                bstr = str.encode(bstr)
             fh.write(bstr)
         if meta:
             with open(f + '.meta', 'w') as fh:
@@ -160,23 +163,23 @@ def peek(self):
         """
         for fname in self.files:
             with open(fname, 'rb') as fh:
-                return self.decompress(fh.read()), self.read_meta(fname)
+                return decode_something_to_string(self.decompress(fh.read())), self.read_meta(fname)
 
     def get(self):
         """ get the next item from the queue
             returns: data_octets, meta_data_dict
         """
         for fname in self.files:
             with open(fname, 'rb') as fh:
-                ret = self.decompress(fh.read())
-            ret = ret, self.read_meta(fname)
+                dat = self.decompress(fh.read())
+            mdat = self.read_meta(fname)
             sz = os.stat(fname).st_size
             self.unlink_(fname)
             self.cn -= 1
             self.sz -= sz
             if self.double_check_cnsz:
                 self._count(double_check_only=True, tag='get')
-            return ret
+            return decode_something_to_string(dat), mdat
 
     def getz(self, sz=SPLUNK_MAX_MSG):
         """ fetch items from the queue and concatenate them together using the
@@ -219,7 +222,7 @@ def getz(self, sz=SPLUNK_MAX_MSG):
             #
             # occasionally this will return something pessimistic
             meta_data[k] = max(meta_data[k])
-        return ret, meta_data
+        return decode_something_to_string(ret), meta_data
 
     def pop(self):
         """ remove the next item from the queue (do not return it); useful with .peek() """
@@ -235,14 +238,8 @@ def pop(self):
     @property
     def files(self):
         """ generate all filenames in the diskqueue (returns iterable) """
-        def _k(x):
-            try:
-                return [int(i) for i in x.split('.')]
-            except:
-                pass
-            return x
         for path, dirs, files in sorted(os.walk(self.directory)):
-            for fname in [os.path.join(path, f) for f in sorted(files, key=_k)]:
+            for fname in [os.path.join(path, f) for f in sorted(files, key=numbered_file_split_key)]:
                 if fname.endswith('.meta'):
                     continue
                 yield fname
diff --git a/hubblestack/hec/obj.py b/hubblestack/hec/obj.py
@@ -18,6 +18,7 @@
 
 from . dq import DiskQueue, NoQueue, QueueCapacityError
 from hubblestack.utils.stdrec import update_payload
+from hubblestack.utils.encoding import encode_something_to_bytes
 
 __version__ = '1.0'
 
@@ -259,7 +260,7 @@ def __init__(self, token, index, http_event_server, host='', http_event_port='80
             md5 = hashlib.md5()
             uril = sorted([ x.uri for x in self.server_uri ])
             for u in uril:
-                md5.update(u)
+                md5.update(encode_something_to_bytes(u))
             actual_disk_queue = os.path.join(disk_queue, md5.hexdigest())
             log.debug("disk_queue for %s: %s", uril, actual_disk_queue)
             self.queue = DiskQueue(actual_disk_queue, size=disk_queue_size, compression=disk_queue_compression)
@@ -299,12 +300,12 @@ def _queue_event(self, payload, meta_data=None):
             log.error("disk queue is full, dropping payload")
 
 
-    def queueEvent(self, dat, eventtime=''):
+    def queueEvent(self, dat, eventtime='', no_queue=False):
         if not isinstance(dat, Payload):
             dat = Payload(dat, eventtime, no_queue=no_queue)
         if dat.no_queue: # here you silly hec, queue this no_queue payload...
             return
-        count_input(payload)
+        count_input(dat)
         self._queue_event(dat)
 
     def flushQueue(self):
diff --git a/hubblestack/utils/encoding.py b/hubblestack/utils/encoding.py
@@ -40,3 +40,15 @@ def encode_base64(starting_string, format_chained=True, chained=None, chained_st
         ret = base64.b64encode(starting_string)
 
     return bool(ret), ret
+
+def encode_something_to_bytes(x):
+    """ pass bytes/bytearray through untouched; utf-8 encode anything else via x.encode() """
+    if not isinstance(x, (bytes, bytearray)):
+        return x.encode('utf-8')
+    return x
+
+def decode_something_to_string(x):
+    """ utf-8 decode bytes/bytearray into a string; hand anything else back unchanged """
+    if not isinstance(x, (bytes, bytearray)):
+        return x
+    return x.decode('utf-8')
diff --git a/hubblestack/utils/misc.py b/hubblestack/utils/misc.py
@@ -0,0 +1,18 @@
+# coding: utf-8
+
+def numbered_file_split_key(x):
+    """ sort key: split filenames like '238048.11', '238048.17', '238048.0'
+        into lists of integers.  Anything that isn't dot-separated integers
+        is tried as a single int; failing that, the key is an empty list.
+
+        for fname in sorted(filenames, key=numbered_file_split_key):
+            do_things_ordered_by_integer_sort()
+    """
+    try:
+        return [int(i) for i in x.split('.')]
+    except (AttributeError, TypeError, ValueError):
+        pass  # no usable .split() or a non-integer piece; try x as one number
+    try:
+        return [int(x)]
+    except (TypeError, ValueError, OverflowError):
+        pass  # not numeric at all; fall through to the empty key
+    return list()
diff --git a/tests/unittests/test_hec_dq.py b/tests/unittests/test_hec_dq.py
@@ -1,3 +1,5 @@
+# coding: utf-8
+
 import pytest
 import os
 
@@ -12,20 +14,33 @@ def samp():
 
 @pytest.fixture
 def dq():
-    return DiskQueue(TEST_DQ_DIR, size=100, fresh=True)
+    return DiskQueue(TEST_DQ_DIR, fresh=True)
 
-def test_disk_queue(dq):
+@pytest.fixture
+def dqc():  # compressed counterpart of the dq fixture, kept in its own ".bz2" directory
+    return DiskQueue(TEST_DQ_DIR + ".bz2", fresh=True, compression=9)
+
+def _test_disk_queue(dq):
     borked = False
 
     dq.put('one', testinator=3)
     dq.put('two', testinator=4)
     dq.put('three', testinator=5)
 
-    assert len(dq) == 13
-    assert dq.peek() == (b'one', {'testinator': 3})
-    assert dq.get() == (b'one', {'testinator': 3})
-    assert dq.peek() == (b'two', {'testinator': 4})
-    assert len(dq) == 9
+    if not dq.compression:
+        # NOTE: compression adds per-item overhead (headers and the like), so
+        # for payloads this small the compressed dq is probably larger than
+        # the 13 bytes expected here. A size check would only be meaningful
+        # with messages big enough to actually shrink, so the length
+        # assertions are simply skipped for the compressed dq.
+        assert len(dq) == 13
+
+    assert dq.peek() == ('one', {'testinator': 3})
+    assert dq.get() == ('one', {'testinator': 3})
+    assert dq.peek() == ('two', {'testinator': 4})
+
+    if not dq.compression:
+        assert len(dq) == 9
 
     assert dq.getz() == ('two three', {'testinator': 5})
     assert len(dq) == 0
@@ -37,18 +52,23 @@ def test_disk_queue(dq):
     assert dq.getz(8) == ('one two', {})
     assert dq.getz(8) == ('three', {})
 
+def test_disk_queue(dq):
+    _test_disk_queue(dq)  # shared put/peek/get/getz scenario, uncompressed queue
+
+def test_disk_queue_with_compression(dqc):
+    _test_disk_queue(dqc)  # same scenario against the compression=9 queue
+
 def _test_pop(samp,q):
     for i in samp:
         q.put(i)
     for i in samp:
-        assert q.peek() == (str.encode(i), {})
+        assert q.peek() == (i, {})
         q.pop()
 
 def test_dq_pop(samp,dq):
     _test_pop(samp,dq)
 
-def test_disk_queue_put_estimator():
-    dq = DiskQueue(TEST_DQ_DIR, fresh=True)
+def test_disk_queue_put_estimator(dq):
     for item in ['hi-there-{}'.format(x) for x in range(20)]:
         pre = dq.cn, dq.sz
         dq.put(item)
diff --git a/tests/unittests/test_hec_obj.py b/tests/unittests/test_hec_obj.py
@@ -0,0 +1,33 @@
+# coding: utf-8
+
+import os
+import json
+import mock
+from hubblestack.hec import HEC
+
+TEST_DQ_DIR = os.environ.get('TEST_DQ_DIR', '/tmp/dq.{0}'.format(os.getuid()))
+
+@mock.patch.object(HEC, '_send')  # replace HEC._send with a mock for this test
+def test_hec__send_trivially(mock_send):
+    hec = HEC('token', 'index', 'server')  # no disk_queue arguments given
+    hec.sendEvent({'test': 'test-tacular'})
+    assert json.loads(mock_send.call_args.args[0].dat)['test'] == 'test-tacular'  # .dat of the last _send arg is JSON
+
+@mock.patch.object(HEC, '_send') # patched so the side_effect below can record each _send call
+def test_queue_things_with_compression(mock_send, __opts__, __salt__):
+    hec = HEC('token', 'index', 'server',
+        disk_queue=TEST_DQ_DIR, disk_queue_size=1000,
+        disk_queue_compression=9)
+
+    results_of_side_effect = list()  # every argument _send is called with, in call order
+    def side_effect(x):
+        results_of_side_effect.append(x)
+    mock_send.side_effect = side_effect
+
+    gz = list()  # the JSON text expected for each queued event, in put order
+    for i in range(100):
+        dat = {f'event{i}': f'test{i}'}
+        hec.queueEvent(dat)
+        gz.append( json.dumps(dat) )
+    hec.flushQueue()
+    assert ' '.join(results_of_side_effect) == ' '.join(gz)  # NOTE(review): join() needs str items; confirm _send gets strings here