Added some fixes, Session definition and tests, updated docs

RamiAwar · RamiAwar · commit dae36704960d · 2022-05-07T12:05:28.000+02:00
diff --git a/.isort.cfg b/.isort.cfg
@@ -1,3 +1,3 @@
 [settings]
 profile=black
-include_trailing_comma = true
+include_trailing_comma = true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -26,3 +26,10 @@ repos:
       entry: |
         make check-codestyle
       language: system
+
+  - repo: local
+    hooks:
+    - id: todo-checker
+      name: todo-checker
+      entry: todo_checker.sh
+      language: script
diff --git a/README.md b/README.md
@@ -69,7 +69,8 @@ connect(hosts="localhost:9200")
 ```python
 # Create and save doc
 user = User(name="John", age=20)
-user.save(wait_for=True)
+user.save(wait_for=True)  # wait_for explained below
+
 assert user.id != None
 
 # Update doc
@@ -91,6 +92,24 @@ user.save(wait_for=True)
 user.delete(wait_for=True)
 ```
 
+### Sessions
+Sessions are inspired by [SQLAlchemy](https://docs.sqlalchemy.org/en/14/orm/tutorial.html)'s sessions, and are used to simplify bulk operations with the Elasticsearch client. From what I've seen, the ES client makes it pretty hard to use the bulk API directly, so its maintainers created bulk helpers (which in turn have incomplete/wrong docs).
+
+With an ORM, bulk operations can be exposed neatly through a simple API.
+```python
+john = User(name="John")
+sarah = User(name="Sarah")
+
+session = Session()
+
+session.save(john)
+session.save(sarah)
+session.commit()
+```
+
+The sessions API will also be available through a context manager before the v1.0 release.
+
+
 ### Dynamic Index Support
 Pydastic also supports dynamic index specification. The model Metaclass index definition is still mandatory, but if an index is specified when performing operations, that will be used instead.
 The model Metaclass index is technically a fallback, although most users will probably be using a single index per model. For some users, multiple indices per model are needed (for example one user index per company).
@@ -102,6 +121,16 @@ user.save(index="my-user", wait_for=True)
 user.delete(index="my-user", wait_for=True)
 ```
 
+
+### Notes on testing
+When writing tests with Pydastic (this also applies when writing tests with the Elasticsearch client directly), remember to use the `wait_for=True` argument when executing operations. If it is not used, the test will continue executing even if Elasticsearch hasn't propagated the change to all nodes, giving you confusing, flaky results.
+
+For example, if you save a document and then try to get it immediately afterwards, you'll get a document-not-found error. This is solved by using the `wait_for` argument in Pydastic (equivalent to `refresh="wait_for"` in Elasticsearch).
+
+Here is [a reference](https://elasticsearch-py.readthedocs.io/en/v8.2.0/api.html#elasticsearch.Elasticsearch.index) to where this argument is listed in the docs. 
+
+It's also supported in the bulk helpers even though it's not mentioned in their docs, but you wouldn't figure that out unless you dug into their source and traced back several function calls where `*args` and `**kwargs` are just being forwarded across calls. :)
+
 ## Support Elasticsearch Versions
 
 Part of the build flow is running the tests using elasticsearch 7.12.0 DB as well as python client, and using 8.1.2 as well (DB as well as client, as part of a build matrix).
diff --git a/pydastic/__init__.py b/pydastic/__init__.py
@@ -18,8 +18,21 @@ def get_version() -> str:
 
 version: str = get_version()
 
-from pydastic.error import NotFoundError
+from pydastic.error import (
+    InvalidElasticsearchResponse,
+    InvalidModelError,
+    NotFoundError,
+)
 from pydastic.model import ESModel
 from pydastic.pydastic import PydasticClient, connect
-
-__all__ = ["ESModel", "NotFoundError", "PydasticClient", "connect"]
+from pydastic.session import Session
+
+__all__ = [
+    "ESModel",
+    "Session",
+    "NotFoundError",
+    "InvalidModelError",
+    "InvalidElasticsearchResponse",
+    "PydasticClient",
+    "connect",
+]
diff --git a/pydastic/error.py b/pydastic/error.py
@@ -8,3 +8,7 @@ class IndexDoesNotFoundError(Exception):
 
 class InvalidElasticsearchResponse(Exception):
     ...
+
+
+class InvalidModelError(Exception):
+    """Raised when a model lacks data an operation requires (e.g. no ``id`` for a session update)."""
+    ...
diff --git a/pydastic/model.py b/pydastic/model.py
@@ -3,8 +3,9 @@
 from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
 
 from elasticsearch import NotFoundError as ElasticNotFoundError
-from pydantic import BaseModel
-from pydantic.main import Field, FieldInfo, ModelMetaclass
+from pydantic import BaseModel, Field
+from pydantic.fields import FieldInfo
+from pydantic.main import ModelMetaclass
 
 from pydastic.error import InvalidElasticsearchResponse, NotFoundError
 from pydastic.pydastic import _client
@@ -187,8 +188,6 @@ def delete(self: Type[M], index: Optional[str] = None, wait_for: Optional[bool]
         if not self.id:
             raise ValueError("id missing from object")
 
-        doc = self.dict(exclude={"id"})
-
         # Allow waiting for shards - useful when testing
         refresh = "false"
         if wait_for:
@@ -199,6 +198,6 @@ def delete(self: Type[M], index: Optional[str] = None, wait_for: Optional[bool]
             index = self.Meta.index
 
         try:
-            res = _client.client.delete(index=index, id=self.id, refresh=refresh)
+            _client.client.delete(index=index, id=self.id, refresh=refresh)
         except ElasticNotFoundError:
             raise NotFoundError(f"document with id {id} not found")
diff --git a/pydastic/session.py b/pydastic/session.py
@@ -0,0 +1,48 @@
+from typing import Optional
+
+from elasticsearch.helpers import bulk
+
+from pydastic.error import InvalidModelError
+from pydastic.model import ESModel
+from pydastic.pydastic import _client
+
+
+class Session:
+    def __init__(self):
+        # Initialize state
+        self._operations = []
+
+    def save(self, model: ESModel, index: Optional[str] = None):
+        # Create save bulk operation
+        if not index:
+            index = model.Meta.index
+
+        doc = model.dict(exclude={"id"})
+        op = {"_index": index, "_op_type": "index", **doc}
+
+        self._operations.append(op)
+
+    def update(self, model: ESModel, index: Optional[str] = None):
+        if not index:
+            index = model.Meta.index
+
+        if not model.id:
+            raise InvalidModelError("model id property is required for update operations")
+
+        doc = model.dict(exclude={"id"})
+        op = {"_id": model.id, "_index": index, "_op_type": "update", "_source": {"doc": doc}}
+
+        self._operations.append(op)
+
+    def commit(self, wait_for: Optional[bool] = False):
+        refresh = "false"
+        if wait_for:
+            refresh = "wait_for"
+
+        results = bulk(client=_client.client, actions=self._operations, refresh=refresh)
+
+        # TODO: Process errors from operations
+        pass
+
+    def delete(self, model: ESModel, index: Optional[str] = None):
+        raise NotImplementedError
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "pydastic"
-version = "0.2.2"
+version = "0.3.0"
 description = "Pydastic is an elasticsearch python ORM based on Pydantic."
 readme = "README.md"
 authors = ["pydastic <rami.awar.ra@gmail.com>"]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -7,7 +7,8 @@
 
 @pytest.fixture()
 def es() -> Elasticsearch:
-    connect(hosts="http://localhost:9200", ssl_show_warn=False)
+    connect(hosts="http://localhost:9200")
+    _client.client.delete_by_query(index="_all", body={"query": {"match_all": {}}}, wait_for_completion=True, refresh=True)
     return _client.client
 
 
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -112,6 +112,9 @@ def test_model_save_additional_fields(es: Elasticsearch):
     assert dict(user_dict, **extra_fields) == user_dict
 
 
+# TODO: Test partial updates (to dict, exclude = True) -> maybe provide a partial api?
+
+
 def test_model_ignores_additional_fields(es: Elasticsearch):
     extra_fields = {"name": "John", "location": "Seattle", "manager_ids": ["Pam", "Sam"]}
     res = es.index(index=User.Meta.index, body=extra_fields)
diff --git a/tests/test_session.py b/tests/test_session.py
@@ -0,0 +1,32 @@
+import pytest
+from elasticsearch import Elasticsearch
+from user import User
+
+from pydastic import ESModel, Session
+
+
+def test_session_save(es: Elasticsearch):
+    user = User(name="John")
+
+    session = Session()
+
+    session.save(user)
+    session.commit(wait_for=True)
+
+    res = es.search(index=user.Meta.index, body={"query": {"match_all": {}}})
+    assert len(res["hits"]["hits"]) == 1
+
+    model = user.to_es()
+    assert res["hits"]["hits"][0]["_source"] == model
+
+
+def test_session_save_with(es: Elasticsearch):
+    ...
+
+
+def test_session_save_with_bulk_error(es: Elasticsearch):
+    ...
+
+
+def test_session_update(es: Elasticsearch):
+    ...
diff --git a/todo_checker.sh b/todo_checker.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+check_file() {
+    local file=$1
+    local match_pattern=$2
+
+    local file_changes_with_context=$(git diff -U999999999 -p --cached --color=always -- $file)
+
+    # From the diff, get the green lines starting with '+' and including '$match_pattern'
+    local matched_additions=$(echo "$file_changes_with_context" | grep -C4 $'^\e\\[32m\+.*'"$match_pattern")
+
+    if [ -n "$matched_additions" ]; then
+        echo -e "\n$file additions match '$match_pattern':\n"
+
+        for matched_line in $matched_additions
+        do
+            echo "$matched_line"
+        done
+
+        echo "Not committing, because $file matches $match_pattern"
+        exit 1
+    fi
+}
+
+# Actual hook logic:
+
+MATCH='TODO'
+for file in `git diff --cached -p --name-status | cut -c3-`; do
+    for match_pattern in $MATCH
+    do
+        check_file $file $match_pattern
+    done
+done
+exit