This repository was archived by the owner on Apr 20, 2024. It is now read-only.

Commit fb9beee

Merge pull request #25 from aojea/monitoring
Metrics and docs
2 parents 2b60cc5 + 1061e09 · commit fb9beee

12 files changed: +452 -46 lines

.github/workflows/e2e.yml (+1 -1)

@@ -127,7 +127,7 @@ jobs:
         /usr/local/bin/kubectl get nodes -o wide
         /usr/local/bin/kubectl get pods -A
         /usr/local/bin/kubectl wait --timeout=1m --for=condition=ready pods --namespace=kube-system -l k8s-app=kube-dns
-        /usr/local/bin/kubectl wait --timeout=1m --for=condition=ready pods --namespace=kube-system -l app=kube-netpol
+        /usr/local/bin/kubectl wait --timeout=1m --for=condition=ready pods --namespace=kube-system -l app=kube-network-policies

     - name: Run tests
       run: |

README.md (+22)

@@ -9,6 +9,28 @@ This project takes a different approach. It uses the NFQUEUE functionality imple
 There are some performance improvements that can be applied, such as to restrict in the dataplane the packets that are sent to userspace to the ones that have network policies only, so only
 the Pods affected by network policies will hit the first byte performance.

+## Metrics
+
+Prometheus metrics are exposed on the address defined by the flag:
+
+```
+-metrics-bind-address string
+      The IP address and port for the metrics server to serve on (default ":9080")
+```
+
+Currently implemented metrics are:
+
+* packet_process_time: Time it has taken to process each packet (microseconds)
+* packet_process_duration_microseconds: A summary of the packet processing durations in microseconds
+* packet_count: Number of packets
+* nfqueue_queue_total: The number of packets currently queued and waiting to be processed by the application
+* nfqueue_queue_dropped: Number of packets that had to be dropped by the kernel because too many packets are already waiting for user space to send back the mandatory accept/drop verdicts
+* nfqueue_user_dropped: Number of packets that were dropped within the netlink subsystem. Such drops usually happen when the corresponding socket buffer is full; that is, user space is not able to read messages fast enough
+* nfqueue_packet_id: ID of the most recent packet queued
+
+## Testing
+
+See [docs/testing/README.md](docs/testing/README.md)

 ## References

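
The metrics listed above are plain Prometheus collectors served on the /metrics endpoint. As an illustration only, here is a minimal sketch of how a summary such as packet_process_duration_microseconds could be registered and observed with prometheus/client_golang; the actual definitions live elsewhere in the repository and may differ in detail.

```go
package main

import (
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// packetProcessDuration mirrors the packet_process_duration_microseconds
// summary named in the README; the repository's real definition may differ.
var packetProcessDuration = prometheus.NewSummary(prometheus.SummaryOpts{
	Name: "packet_process_duration_microseconds",
	Help: "A summary of the packet processing durations in microseconds",
})

func main() {
	prometheus.MustRegister(packetProcessDuration)

	// Simulate timing the processing of a single packet and recording it.
	start := time.Now()
	// ... process the packet ...
	packetProcessDuration.Observe(float64(time.Since(start).Microseconds()))

	// Expose the metrics on the default bind address used by the controller.
	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":9080", nil)
}
```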

cmd/main.go (+12 -5)

@@ -4,6 +4,7 @@ import (
     "context"
     "flag"
     "fmt"
+    "net"
     "net/http"
     "os"
     "os/signal"
@@ -20,13 +21,15 @@ import (
 )

 var (
-    failOpen bool
-    queueID  int
+    failOpen           bool
+    queueID            int
+    metricsBindAddress string
 )

 func init() {
-    flag.BoolVar(&failOpen, "fail-open", false, "If set, don't drop packets if the controller is not running (default false)")
-    flag.IntVar(&queueID, "nfqueue-id", 100, "Number of the nfqueue used (default 100)")
+    flag.BoolVar(&failOpen, "fail-open", false, "If set, don't drop packets if the controller is not running")
+    flag.IntVar(&queueID, "nfqueue-id", 100, "Number of the nfqueue used")
+    flag.StringVar(&metricsBindAddress, "metrics-bind-address", ":9080", "The IP address and port for the metrics server to serve on")

     flag.Usage = func() {
         fmt.Fprint(os.Stderr, "Usage: kube-netpol [options]\n\n")
@@ -39,6 +42,10 @@ func main() {
     klog.InitFlags(nil)
     flag.Parse()
     //
+    if _, _, err := net.SplitHostPort(metricsBindAddress); err != nil {
+        klog.Fatalf("error parsing metrics bind address %s : %v", metricsBindAddress, err)
+    }
+
     cfg := networkpolicy.Config{
         FailOpen: failOpen,
         QueueID:  queueID,
@@ -69,7 +76,7 @@ func main() {
     informersFactory := informers.NewSharedInformerFactory(clientset, 0)

     http.Handle("/metrics", promhttp.Handler())
-    go http.ListenAndServe(":9080", nil)
+    go http.ListenAndServe(metricsBindAddress, nil)

     networkPolicyController := networkpolicy.NewController(
         clientset,
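
Condensed for illustration, the pattern this change introduces — validate the bind address up front, then serve the Prometheus handler on it in a goroutine — looks roughly like the following standalone sketch (it uses the standard log package instead of klog and omits the controller wiring).

```go
package main

import (
	"flag"
	"log"
	"net"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var metricsBindAddress string

func init() {
	flag.StringVar(&metricsBindAddress, "metrics-bind-address", ":9080",
		"The IP address and port for the metrics server to serve on")
}

func main() {
	flag.Parse()

	// Reject malformed addresses early, before the controller starts.
	if _, _, err := net.SplitHostPort(metricsBindAddress); err != nil {
		log.Fatalf("error parsing metrics bind address %s : %v", metricsBindAddress, err)
	}

	// Serve the Prometheus registry on the configured address in the background.
	http.Handle("/metrics", promhttp.Handler())
	go func() {
		if err := http.ListenAndServe(metricsBindAddress, nil); err != nil {
			log.Fatalf("metrics server failed: %v", err)
		}
	}()

	select {} // block forever, standing in for the controller's Run loop
}
```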

docs/testing/README.md (+128)

@@ -0,0 +1,128 @@
+# Testing
+
+This is an example of how to do some microbenchmarking.
+
+1. Collect the existing metrics from the agents
+
+Example [deployment with Prometheus](./monitoring.yaml)
+
+2. Deploy some Pods running an HTTP server behind a Service
+
+Since network policies act on the first packet of a connection, we need to generate new connections:
+* We cannot use HTTP keep-alives, HTTP/2, or other protocols that multiplex requests over the same connection
+* A pair of endpoints will be limited by the number of ephemeral ports at the origin, since the destination IP and port are fixed
+
+```
+cat /proc/sys/net/ipv4/ip_local_port_range
+32768   60999
+```
+
+3. Run a [Job that polls the Service created previously](job_poller.yaml)
+
+Each Pod runs requests in parallel
+
+```
+kubectl logs abtest-t7wjd
+This is ApacheBench, Version 2.3 <$Revision: 1913912 $>
+Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
+Licensed to The Apache Software Foundation, http://www.apache.org/
+
+Benchmarking test-service (be patient)
+Completed 1000 requests
+Completed 2000 requests
+Completed 3000 requests
+Completed 4000 requests
+Completed 5000 requests
+Completed 6000 requests
+Completed 7000 requests
+Completed 8000 requests
+Completed 9000 requests
+Completed 10000 requests
+Finished 10000 requests
+
+
+Server Software:
+Server Hostname:        test-service
+Server Port:            80
+
+Document Path:          /
+Document Length:        60 bytes
+
+Concurrency Level:      1000
+Time taken for tests:   4.317 seconds
+Complete requests:      10000
+Failed requests:        1274
+   (Connect: 0, Receive: 0, Length: 1274, Exceptions: 0)
+Total transferred:      1768597 bytes
+HTML transferred:       598597 bytes
+Requests per second:    2316.61 [#/sec] (mean)
+Time per request:       431.666 [ms] (mean)
+Time per request:       0.432 [ms] (mean, across all concurrent requests)
+Transfer rate:          400.11 [Kbytes/sec] received
+
+Connection Times (ms)
+              min  mean[+/-sd] median   max
+Connect:        0  188 571.9      4    4121
+Processing:     0    2   5.3      0      42
+Waiting:        0    1   2.8      0      32
+Total:          0  190 571.8      5    4122
+
+Percentage of the requests served within a certain time (ms)
+  50%      5
+  66%      7
+  75%     22
+  80%     24
+  90%   1023
+  95%   1046
+  98%   2063
+  99%   3080
+ 100%   4122 (longest request)
+```
+
+You may have to tune your system, as it is likely you will hit limits on some resources, especially the conntrack table
+
+```
+[1825525.815672] net_ratelimit: 411 callbacks suppressed
+[1825525.815676] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.827617] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.834317] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.841058] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.847764] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.854458] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.861131] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.867814] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.874505] nf_conntrack: nf_conntrack: table full, dropping packet
+[1825525.881186] nf_conntrack: nf_conntrack: table full, dropping packet
+```
+
+Check the current maximum number of conntrack entries allowed and tune accordingly
+
+```
+cat /proc/sys/net/netfilter/nf_conntrack_max
+262144
+```
+
+
+4. Observe the metrics in Prometheus or Grafana
+
+
+![Packet Processing Latency](network_policies_latency.png "Packet Processing Latency")
+![Packet Rate](network_policies_packet_rate.png "Packet Rate")
+
+
+## Future work
+
+We are interested in understanding the following variables:
+
+* Memory and CPU consumption
+* Latency of packet processing
+* Latency to apply a network policy after it has been created
+
+This can be microbenchmarked easily using one Node or a KIND cluster, adding fake nodes and pods (https://developer.ibm.com/tutorials/awb-using-kwok-to-simulate-a-large-kubernetes-openshift-cluster/) and running scenarios on just one node with the different variables
+
+
+Inputs:
+
+* New connections per second
+* Number of Pods on the cluster (affected or not affected by network policies)
+* Number of Network Policies impacting the connections
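
Step 2 of the testing document above stresses that every request must open a fresh connection, because only the first packet of a connection is evaluated against network policies. As a hypothetical alternative to ab, a small Go load generator can enforce this by disabling HTTP keep-alives; the test-service URL below matches the example manifests, and the request count is arbitrary.

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	// DisableKeepAlives forces a new TCP connection per request, so each
	// request exercises the network-policy evaluation on its first packet.
	client := &http.Client{
		Transport: &http.Transport{DisableKeepAlives: true},
		Timeout:   5 * time.Second,
	}

	for i := 0; i < 100; i++ {
		resp, err := client.Get("http://test-service:80/")
		if err != nil {
			fmt.Println("request failed:", err)
			continue
		}
		resp.Body.Close()
	}
	fmt.Println("done")
}
```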

docs/testing/backend.yaml (+35)

@@ -0,0 +1,35 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: server-deployment
+  labels:
+    app: MyApp
+spec:
+  replicas: 10
+  selector:
+    matchLabels:
+      app: MyApp
+  template:
+    metadata:
+      labels:
+        app: MyApp
+    spec:
+      containers:
+      - name: agnhost
+        image: k8s.gcr.io/e2e-test-images/agnhost:2.39
+        args:
+        - netexec
+        - --http-port=80
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: test-service
+spec:
+  type: ClusterIP
+  selector:
+    app: MyApp
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: 80

docs/testing/job_poller.yaml (+14)

@@ -0,0 +1,14 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: abtest
+spec:
+  completions: 50
+  parallelism: 10
+  template:
+    spec:
+      containers:
+      - name: ab
+        image: httpd:2
+        command: ["ab", "-n", "10000", "-c", "1000", "-v", "1", "http://test-service:80/"]
+      restartPolicy: Never
