From 6adfaa8342f0057fcb908443535e87978f34ff32 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Sat, 28 Oct 2017 17:51:28 +0200
Subject: [PATCH 01/41] Started on #359

---
 libethash-cuda/CUDAMiner.cpp               | 100 ++++++++++++---------
 libethash-cuda/CUDAMiner.h                 |   1 +
 libethash-cuda/ethash_cuda_miner.cpp       |  27 ++++--
 libethash-cuda/ethash_cuda_miner.h         |   1 +
 libethash-cuda/ethash_cuda_miner_kernel.cu |   2 +-
 5 files changed, 77 insertions(+), 54 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index f870615999..75ac9cde76 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -116,73 +116,85 @@ void CUDAMiner::kickOff()
 	startWorking();
 }
 
-void CUDAMiner::workLoop()
+void CUDAMiner::initDevice(WorkPackage w)
 {
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
-		WorkPackage w = work();
-		if (!w)
-			return;
-
-		cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
-		if (!m_miner || m_minerSeed != w.seed)
+		unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
+		if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
 		{
-			unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
-
-			if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
+			while (s_dagLoadIndex < index) {
+				this_thread::sleep_for(chrono::milliseconds(100)); //changed this from 1 second
+			}
+		}
+		else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		{
+			if (device != s_dagCreateDevice)
 			{
-				while (s_dagLoadIndex < index) {
-					this_thread::sleep_for(chrono::seconds(1));
+				// wait until DAG is created on selected device
+				while (s_dagInHostMemory == NULL) {
+					this_thread::sleep_for(chrono::milliseconds(100));//changed this from 1 second
 				}
 			}
-			else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+			else
 			{
-				if (device != s_dagCreateDevice)
-				{
-					// wait until DAG is created on selected device
-					while (s_dagInHostMemory == NULL) {
-						this_thread::sleep_for(chrono::seconds(1));
-					}
-				}
-				else
-				{
-					// reset load index
-					s_dagLoadIndex = 0;
-				}
+				// reset load index
+				s_dagLoadIndex = 0;
 			}
+		}
 
-			cnote << "Initialising miner...";
-			m_minerSeed = w.seed;
+		cnote << "Initialising miner...";
+		m_minerSeed = w.seed;
 
-			delete m_miner;
-			m_miner = new ethash_cuda_miner;
+		delete m_miner;
+		m_miner = new ethash_cuda_miner;
 
-			EthashAux::LightType light;
-			light = EthashAux::light(w.seed);
-			//bytesConstRef dagData = dag->data();
-			bytesConstRef lightData = light->data();
+		EthashAux::LightType light;
+		light = EthashAux::light(w.seed);
+		//bytesConstRef dagData = dag->data();
+		bytesConstRef lightData = light->data();
 
-			m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
-			s_dagLoadIndex++;
+		m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
+		s_dagLoadIndex++;
 
-			if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		{
+			if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
 			{
-				if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
-				{
-					// all devices have loaded DAG, we can free now
-					delete[] s_dagInHostMemory;
-					s_dagInHostMemory = NULL;
-
-					cout << "Freeing DAG from host" << endl;
-				}
+				// all devices have loaded DAG, we can free now
+				delete[] s_dagInHostMemory;
+				s_dagInHostMemory = NULL;
+				cout << "Freeing DAG from host" << endl;
 			}
 		}
 
+	}
+	catch (std::runtime_error const& _e)
+	{
+		delete m_miner;
+		m_miner = nullptr;
+		cwarn << "Error CUDA mining: " << _e.what();
+	}
+}
+
+void CUDAMiner::workLoop()
+{
+	try
+	{
+		WorkPackage w = work();
+		if(!w)
+			return;
+		cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
+		if (!m_miner || m_minerSeed != w.seed)
+		{
+			initDevice(w);
+		}
 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
 		uint64_t startN = w.startNonce;
 		if (w.exSizeBits >= 0)
 			startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
 		m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
+
 	}
 	catch (std::runtime_error const& _e)
 	{
diff --git a/libethash-cuda/CUDAMiner.h b/libethash-cuda/CUDAMiner.h
index 82226cc2a3..5157e2297e 100644
--- a/libethash-cuda/CUDAMiner.h
+++ b/libethash-cuda/CUDAMiner.h
@@ -76,6 +76,7 @@ class EthashCUDAHook;
 	private:
 		void workLoop() override;
 		void report(uint64_t _nonce);
+		void initDevice(WorkPackage w);
 
 		EthashCUDAHook* m_hook = nullptr;
 		ethash_cuda_miner* m_miner = nullptr;
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index d2d7fe45b2..d36075eacc 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -205,24 +205,19 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 {
 	try
 	{
-		int device_count = getNumDevices();
+		unsigned device_count = getNumDevices();
 
 		if (device_count == 0)
 			return false;
 
 		// use selected device
-		int device_num = std::min<int>((int)_deviceId, device_count - 1);
+		int device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
 
 		cudaDeviceProp device_props;
 		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
-		CUDA_SAFE_CALL(cudaSetDevice(device_num));
-		CUDA_SAFE_CALL(cudaDeviceReset());
-		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
-		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
-
 		m_search_buf = new volatile uint32_t *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];
 
@@ -230,13 +225,27 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
+		if(dagSize != m_current_dagSize)
+		{
+			//We need to reset the device
+		}else
+		{
+			//We only need to reset the light 
+		}
+		CUDA_SAFE_CALL(cudaSetDevice(device_num));
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+
+
+
 		// create buffer for cache
 		hash64_t * light = NULL;
 
 		if (!*hostDAG)
 		{
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
-			// copy dag cache to CPU.
+			// copy lightData to device
 			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
 		}
 
@@ -278,7 +287,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			const void* hdag = (const void*)(*hostDAG);
 			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
 		}
-
+		m_current_dagSize = dagSize;
 		return true;
 	}
 	catch (runtime_error const&)
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 65134cff43..170ac3f7e6 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -54,6 +54,7 @@ class ethash_cuda_miner
 	uint64_t m_current_nonce;
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
+	uint64_t m_current_dagSize;
 
 	uint32_t m_sharedBytes;
 
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index d25cc14abe..1fcffbab29 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -100,7 +100,7 @@ ethash_calculate_dag_item(uint32_t start)
 #else
 				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
 #endif
-				if (t == thread_id) {
+				if (t == thread_id) { //potential performance leak (branching in warp)
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}

From a816f1c9044a37e8493a27986ab1d8164fcb90a8 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 23 Nov 2017 12:10:49 +0100
Subject: [PATCH 02/41] Rewrote CUDAMiner

---
 libethash-cuda/CUDAMiner.cpp               | 47 ++++++++++++++--------
 libethash-cuda/CUDAMiner.h                 |  2 +
 libethash-cuda/ethash_cuda_miner.cpp       |  2 +-
 libethash-cuda/ethash_cuda_miner_kernel.cu |  5 ++-
 4 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 75ac9cde76..8b2c76a5ce 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -116,7 +116,7 @@ void CUDAMiner::kickOff()
 	startWorking();
 }
 
-void CUDAMiner::initDevice(WorkPackage w)
+bool CUDAMiner::init(const h256& seed)
 {
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
@@ -144,13 +144,13 @@ void CUDAMiner::initDevice(WorkPackage w)
 		}
 
 		cnote << "Initialising miner...";
-		m_minerSeed = w.seed;
+		m_minerSeed = seed;
 
 		delete m_miner;
 		m_miner = new ethash_cuda_miner;
 
 		EthashAux::LightType light;
-		light = EthashAux::light(w.seed);
+		light = EthashAux::light(seed);
 		//bytesConstRef dagData = dag->data();
 		bytesConstRef lightData = light->data();
 
@@ -167,34 +167,49 @@ void CUDAMiner::initDevice(WorkPackage w)
 				cout << "Freeing DAG from host" << endl;
 			}
 		}
-
+		return true;
 	}
 	catch (std::runtime_error const& _e)
 	{
 		delete m_miner;
 		m_miner = nullptr;
 		cwarn << "Error CUDA mining: " << _e.what();
+		return false;
 	}
 }
 
 void CUDAMiner::workLoop()
 {
+	WorkPackage current;
+	current.header = h256{1u};
+	current.seed = h256{1u};
 	try
 	{
-		WorkPackage w = work();
-		if(!w)
-			return;
-		cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
-		if (!m_miner || m_minerSeed != w.seed)
+		while(true)
 		{
-			initDevice(w);
+			WorkPackage w = work();
+			if(current.header != w.header)
+			{
+				if(!w)
+				{
+					cnote << "No work. Pause for 3 s.";
+					std::this_thread::sleep_for(std::chrono::seconds(3));
+					continue;
+				}
+				
+				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
+				if (current.seed != w.seed || !m_miner)
+				{
+					init(w.seed);
+				}
+			}
+			
+			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
+			uint64_t startN = w.startNonce;
+			if (w.exSizeBits >= 0)
+				startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
+			m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
 		}
-		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
-		uint64_t startN = w.startNonce;
-		if (w.exSizeBits >= 0)
-			startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
-		m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
-
 	}
 	catch (std::runtime_error const& _e)
 	{
diff --git a/libethash-cuda/CUDAMiner.h b/libethash-cuda/CUDAMiner.h
index 5157e2297e..ce4a3255e8 100644
--- a/libethash-cuda/CUDAMiner.h
+++ b/libethash-cuda/CUDAMiner.h
@@ -78,6 +78,8 @@ class EthashCUDAHook;
 		void report(uint64_t _nonce);
 		void initDevice(WorkPackage w);
 
+		bool init(const h256& seed);
+
 		EthashCUDAHook* m_hook = nullptr;
 		ethash_cuda_miner* m_miner = nullptr;
 
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index d36075eacc..2da902ed30 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -259,7 +259,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
 			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
 		}
-		set_constants(dag, dagSize128, light, lightSize64);
+		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
 		memset(&m_current_header, 0, sizeof(hash32_t));
 		m_current_target = 0;
 		m_current_nonce = 0;
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 1fcffbab29..5ae6f89649 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -100,12 +100,13 @@ ethash_calculate_dag_item(uint32_t start)
 #else
 				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
 #endif
-				if (t == thread_id) { //potential performance leak (branching in warp)
+				if (t == thread_id) {
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}
-
 		}
+
+
 #endif		
 	}
 	SHA3_512(dag_node.uint2s);

From 1be14863435fce80b04233e6fc1ca56977f9b576 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 27 Nov 2017 12:03:55 +0100
Subject: [PATCH 03/41] minor fix

---
 libethash-cuda/CUDAMiner.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 8b2c76a5ce..e4a117c187 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -187,7 +187,8 @@ void CUDAMiner::workLoop()
 	{
 		while(true)
 		{
-			WorkPackage w = work();
+			const WorkPackage w = work();
+			
 			if(current.header != w.header)
 			{
 				if(!w)
@@ -203,7 +204,7 @@ void CUDAMiner::workLoop()
 					init(w.seed);
 				}
 			}
-			
+			current = w;
 			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
 			uint64_t startN = w.startNonce;
 			if (w.exSizeBits >= 0)

From 42bd2872db750978168c13a717707091c1b8b793 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Sat, 2 Dec 2017 22:38:04 +0100
Subject: [PATCH 04/41] rewrote the kernel once again

---
 libethash-cuda/CUDAMiner.cpp         | 23 ++++---
 libethash-cuda/ethash_cuda_miner.cpp | 93 ++++++++++++++--------------
 libethash-cuda/ethash_cuda_miner.h   |  5 +-
 3 files changed, 66 insertions(+), 55 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index e4a117c187..0abbcb92e0 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -17,6 +17,8 @@ along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
 /** @file CUDAMiner.cpp
 * @author Gav Wood <i@gavwood.com>
 * @date 2014
+* @author MariusVanDerWijden
+* @date 2017
 *
 * Determines the PoW algorithm.
 */
@@ -112,8 +114,8 @@ void CUDAMiner::report(uint64_t _nonce)
 
 void CUDAMiner::kickOff()
 {
-	m_hook->reset();
-	startWorking();
+	//m_hook->reset();
+	//startWorking();
 }
 
 bool CUDAMiner::init(const h256& seed)
@@ -146,8 +148,10 @@ bool CUDAMiner::init(const h256& seed)
 		cnote << "Initialising miner...";
 		m_minerSeed = seed;
 
-		delete m_miner;
-		m_miner = new ethash_cuda_miner;
+		//delete m_miner;
+		//m_miner = new ethash_cuda_miner;
+		if(!m_miner)
+			m_miner = new ethash_cuda_miner;
 
 		EthashAux::LightType light;
 		light = EthashAux::light(seed);
@@ -189,7 +193,7 @@ void CUDAMiner::workLoop()
 		{
 			const WorkPackage w = work();
 			
-			if(current.header != w.header)
+			if(!m_miner || current.header != w.header)
 			{
 				if(!w)
 				{
@@ -199,9 +203,10 @@ void CUDAMiner::workLoop()
 				}
 				
 				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
-				if (current.seed != w.seed || !m_miner)
+				if (!m_miner || current.seed != w.seed)
 				{
-					init(w.seed);
+					if(!init(w.seed))
+						break;
 				}
 			}
 			current = w;
@@ -222,8 +227,8 @@ void CUDAMiner::workLoop()
 
 void CUDAMiner::pause()
 {
-	m_hook->abort();
-	stopWorking();
+	//m_hook->abort();
+	//stopWorking();
 }
 
 std::string CUDAMiner::platformInfo()
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 2da902ed30..28a0182f89 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -17,6 +17,8 @@
 /** @file ethash_cuda_miner.cpp
 * @author Genoil <jw@meneer.net>
 * @date 2015
+* @coauthor MariusVanDerWijden
+* @date 2017
 */
 
 
@@ -225,20 +227,6 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
-		if(dagSize != m_current_dagSize)
-		{
-			//We need to reset the device
-		}else
-		{
-			//We only need to reset the light 
-		}
-		CUDA_SAFE_CALL(cudaSetDevice(device_num));
-		CUDA_SAFE_CALL(cudaDeviceReset());
-		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
-		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
-
-
-
 		// create buffer for cache
 		hash64_t * light = NULL;
 
@@ -248,46 +236,61 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			// copy lightData to device
 			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
 		}
-
-		// create buffer for dag
-		hash128_t * dag;
-		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
 		
-		// create mining buffers
-		for (unsigned i = 0; i != s_numStreams; ++i)
+		
+		hash128_t * dag = m_dag;
+		if(dagSize128 != m_dag_size || !dag)
 		{
-			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
-			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
-		}
-		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
-		memset(&m_current_header, 0, sizeof(hash32_t));
-		m_current_target = 0;
-		m_current_nonce = 0;
-		m_current_index = 0;
-
-		m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
+			//We need to reset the device and recreate the dag
+			CUDA_SAFE_CALL(cudaSetDevice(device_num));
+			CUDA_SAFE_CALL(cudaDeviceReset());
+			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+			
+			// create buffer for dag
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
+			
+			// create mining buffers
+			for (unsigned i = 0; i != s_numStreams; ++i)
+			{
+				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+				CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
+			}
+			
+			memset(&m_current_header, 0, sizeof(hash32_t));
+			m_current_target = 0;
+			m_current_nonce = 0;
+			m_current_index = 0;
 
-		if (!*hostDAG)
-		{
-			cudalog << "Generating DAG for GPU #" << device_num;
-			ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+			m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
 
-			if (_cpyToHost)
+			if (!*hostDAG)
 			{
-				uint8_t* memoryDAG = new uint8_t[dagSize];
-				cudalog << "Copying DAG from GPU #" << device_num << " to host";
-				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
+				cudalog << "Generating DAG for GPU #" << device_num;
+				ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+
+				if (_cpyToHost)
+				{
+					uint8_t* memoryDAG = new uint8_t[dagSize];
+					cudalog << "Copying DAG from GPU #" << device_num << " to host";
+					CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
 
-				*hostDAG = (void*)memoryDAG;
+					*hostDAG = (void*)memoryDAG;
+				}
 			}
-		}
-		else
+			else
+			{
+				cudalog << "Copying DAG from host to GPU #" << device_num;
+				const void* hdag = (const void*)(*hostDAG);
+				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
+			}
+		}else
 		{
-			cudalog << "Copying DAG from host to GPU #" << device_num;
-			const void* hdag = (const void*)(*hostDAG);
-			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
+			//We only need to reset the light 
 		}
-		m_current_dagSize = dagSize;
+		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
+		m_dag = dag;
+		m_dag_size = dagSize128;
 		return true;
 	}
 	catch (runtime_error const&)
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 170ac3f7e6..c22760b64f 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -54,9 +54,12 @@ class ethash_cuda_miner
 	uint64_t m_current_nonce;
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
-	uint64_t m_current_dagSize;
 
 	uint32_t m_sharedBytes;
+	
+	///Constants on GPU
+	hash128_t* m_dag;
+	uint32_t m_dag_size;
 
 	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

From 138173d5b7f4cbe2944a2b0446fb1f0f923fd245 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 23 Oct 2017 13:30:11 +0200
Subject: [PATCH 05/41] added support for __shfl_sync since __shfl is
 deprecated with CUDA 9.0

---
 libethash-cuda/ethash_cuda_miner_kernel.cu        | 9 ++++++++-
 libethash-cuda/ethash_cuda_miner_kernel_globals.h | 3 ++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 6092f267a5..2f93648245 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -107,11 +107,18 @@ ethash_calculate_dag_item(uint32_t start)
 	}
 #else
 	for (uint32_t t = 0; t < 4; t++) {
-
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 		uint32_t shuffle_index = __shfl(node_index, t, 4);
+#else
+		uint32_t shuffle_index = __shfl_sync(node_index, t, 4);
+#endif
 		uint4 s[4];
 		for (uint32_t w = 0; w < 4; w++) {
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 			s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
+#else
+			s[w] = make_uint4(__shfl_sync(dag_node.uint4s[w].x, t, 4), __shfl_sync(dag_node.uint4s[w].y, t, 4), __shfl_sync(dag_node.uint4s[w].z, t, 4), __shfl_sync(dag_node.uint4s[w].w, t, 4));
+#endif
 		}
 		dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
 	}
diff --git a/libethash-cuda/ethash_cuda_miner_kernel_globals.h b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
index 5688db1020..8f831e8d24 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@@ -2,6 +2,7 @@
 #define _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
 
 #define SHUFFLE_MIN_VER 300
+#define SHUFFLE_DEPRECATED 900
 
 //#include "cuda_helper.h"
 
@@ -12,4 +13,4 @@ __constant__ hash64_t* d_light;
 __constant__ hash32_t d_header;
 __constant__ uint64_t d_target;
 
-#endif
\ No newline at end of file
+#endif

From 8c7b88e80e4a63353e449c51860cda68559349d9 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 23 Oct 2017 20:42:01 +0200
Subject: [PATCH 06/41] added support for shfl_sync

---
 libethash-cuda/dagger_shuffled.cuh         | 26 ++++++++++++++++++++--
 libethash-cuda/ethash_cuda_miner_kernel.cu | 12 +++++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index 4497ad6995..984e797342 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -30,8 +30,13 @@ __device__ __forceinline__ uint64_t compute_hash(
 			uint2 shuffle[8];
 			for (int j = 0; j < 8; j++) 
 			{
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 				shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH);
 				shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH);
+#else
+				shuffle[j].x = __shfl_sync(state[j].x, i+p, THREADS_PER_HASH);
+				shuffle[j].y = __shfl_sync(state[j].y, i+p, THREADS_PER_HASH);
+#endif
 			}
 			switch (mix_idx)
 			{
@@ -40,7 +45,11 @@ __device__ __forceinline__ uint64_t compute_hash(
 				case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break;
 				case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break;
 			}
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 			init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
+#else
+			init0[p] = __shfl_sync(shuffle[0].x, 0, THREADS_PER_HASH);
+#endif
 		}
 
 		for (uint32_t a = 0; a < ACCESSES; a += 4)
@@ -52,7 +61,11 @@ __device__ __forceinline__ uint64_t compute_hash(
 				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
 					offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
+#else
+					offset[p] = __shfl_sync(offset[p], t, THREADS_PER_HASH);
+#endif
 				}
 				#pragma unroll
 				for (int p = 0; p < _PARALLEL_HASH; p++)
@@ -72,7 +85,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 			uint32_t thread_mix = fnv_reduce(mix[p]);
 
 			// update mix accross threads
-
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 			shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
 			shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
 			shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
@@ -81,7 +94,16 @@ __device__ __forceinline__ uint64_t compute_hash(
 			shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
 			shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
 			shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
-
+#else
+			shuffle[0].x = __shfl_sync(thread_mix, 0, THREADS_PER_HASH);
+			shuffle[0].y = __shfl_sync(thread_mix, 1, THREADS_PER_HASH);
+			shuffle[1].x = __shfl_sync(thread_mix, 2, THREADS_PER_HASH);
+			shuffle[1].y = __shfl_sync(thread_mix, 3, THREADS_PER_HASH);
+			shuffle[2].x = __shfl_sync(thread_mix, 4, THREADS_PER_HASH);
+			shuffle[2].y = __shfl_sync(thread_mix, 5, THREADS_PER_HASH);
+			shuffle[3].x = __shfl_sync(thread_mix, 6, THREADS_PER_HASH);
+			shuffle[3].y = __shfl_sync(thread_mix, 7, THREADS_PER_HASH);
+#endif
 			if ((i+p) == thread_id) {
 				//move mix into state:
 				state[8] = shuffle[0];
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 2f93648245..e6c32d4707 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -85,11 +85,21 @@ ethash_calculate_dag_item(uint32_t start)
 		}
 #else
 		for (uint32_t t = 0; t < 4; t++) {
+
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 			uint32_t shuffle_index = __shfl(parent_index, t, 4);
-			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
+#else
+			uint32_t shuffle_index = __shfl_sync(parent_index, t, 4);
+#endif
 
+			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
 			for (int w = 0; w < 4; w++) {
+
+#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 				uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
+#else
+				uint4 s4 = make_uint4(__shfl_sync(p4.x, w, 4), __shfl_sync(p4.y, w, 4), __shfl_sync(p4.z, w, 4), __shfl_sync(p4.w, w, 4));
+#endif
 				if (t == thread_id) {
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}

From 59a1227c3d6d3d82a92ae9a0dd371acbfb3aeefb Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Tue, 24 Oct 2017 14:45:41 +0200
Subject: [PATCH 07/41] added Bitmask to all shfl_sync operations, to target
 all warps

---
 libethash-cuda/dagger_shuffled.cuh         | 24 +++++++++++-----------
 libethash-cuda/ethash_cuda_miner_kernel.cu |  4 ++--
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index 984e797342..d9509663c9 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -34,8 +34,8 @@ __device__ __forceinline__ uint64_t compute_hash(
 				shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH);
 				shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH);
 #else
-				shuffle[j].x = __shfl_sync(state[j].x, i+p, THREADS_PER_HASH);
-				shuffle[j].y = __shfl_sync(state[j].y, i+p, THREADS_PER_HASH);
+				shuffle[j].x = __shfl_sync(0xFFFFFFFF,state[j].x, i+p, THREADS_PER_HASH);
+				shuffle[j].y = __shfl_sync(0xFFFFFFFF,state[j].y, i+p, THREADS_PER_HASH);
 #endif
 			}
 			switch (mix_idx)
@@ -48,7 +48,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 #if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 			init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
 #else
-			init0[p] = __shfl_sync(shuffle[0].x, 0, THREADS_PER_HASH);
+			init0[p] = __shfl_sync(0xFFFFFFFF,shuffle[0].x, 0, THREADS_PER_HASH);
 #endif
 		}
 
@@ -64,7 +64,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 #if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
 #else
-					offset[p] = __shfl_sync(offset[p], t, THREADS_PER_HASH);
+					offset[p] = __shfl_sync(0xFFFFFFFF,offset[p], t, THREADS_PER_HASH);
 #endif
 				}
 				#pragma unroll
@@ -95,14 +95,14 @@ __device__ __forceinline__ uint64_t compute_hash(
 			shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
 			shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
 #else
-			shuffle[0].x = __shfl_sync(thread_mix, 0, THREADS_PER_HASH);
-			shuffle[0].y = __shfl_sync(thread_mix, 1, THREADS_PER_HASH);
-			shuffle[1].x = __shfl_sync(thread_mix, 2, THREADS_PER_HASH);
-			shuffle[1].y = __shfl_sync(thread_mix, 3, THREADS_PER_HASH);
-			shuffle[2].x = __shfl_sync(thread_mix, 4, THREADS_PER_HASH);
-			shuffle[2].y = __shfl_sync(thread_mix, 5, THREADS_PER_HASH);
-			shuffle[3].x = __shfl_sync(thread_mix, 6, THREADS_PER_HASH);
-			shuffle[3].y = __shfl_sync(thread_mix, 7, THREADS_PER_HASH);
+			shuffle[0].x = __shfl_sync(0xFFFFFFFF,thread_mix, 0, THREADS_PER_HASH);
+			shuffle[0].y = __shfl_sync(0xFFFFFFFF,thread_mix, 1, THREADS_PER_HASH);
+			shuffle[1].x = __shfl_sync(0xFFFFFFFF,thread_mix, 2, THREADS_PER_HASH);
+			shuffle[1].y = __shfl_sync(0xFFFFFFFF,thread_mix, 3, THREADS_PER_HASH);
+			shuffle[2].x = __shfl_sync(0xFFFFFFFF,thread_mix, 4, THREADS_PER_HASH);
+			shuffle[2].y = __shfl_sync(0xFFFFFFFF,thread_mix, 5, THREADS_PER_HASH);
+			shuffle[3].x = __shfl_sync(0xFFFFFFFF,thread_mix, 6, THREADS_PER_HASH);
+			shuffle[3].y = __shfl_sync(0xFFFFFFFF,thread_mix, 7, THREADS_PER_HASH);
 #endif
 			if ((i+p) == thread_id) {
 				//move mix into state:
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index e6c32d4707..7ea5f7ed2e 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -89,7 +89,7 @@ ethash_calculate_dag_item(uint32_t start)
 #if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 			uint32_t shuffle_index = __shfl(parent_index, t, 4);
 #else
-			uint32_t shuffle_index = __shfl_sync(parent_index, t, 4);
+			uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,parent_index, t, 4);
 #endif
 
 			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
@@ -98,7 +98,7 @@ ethash_calculate_dag_item(uint32_t start)
 #if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
 				uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
 #else
-				uint4 s4 = make_uint4(__shfl_sync(p4.x, w, 4), __shfl_sync(p4.y, w, 4), __shfl_sync(p4.z, w, 4), __shfl_sync(p4.w, w, 4));
+				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
 #endif
 				if (t == thread_id) {
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);

From 4b30cfb987ea32e5ac722c23084792604953009e Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Tue, 24 Oct 2017 15:42:50 +0200
Subject: [PATCH 08/41] Now Use CUDA_VERSION instead of __CUDA_ARCH__

---
 libethash-cuda/dagger_shuffled.cuh                | 8 ++++----
 libethash-cuda/ethash_cuda_miner_kernel.cu        | 8 ++++----
 libethash-cuda/ethash_cuda_miner_kernel_globals.h | 4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
index d9509663c9..c1140dfddc 100644
--- a/libethash-cuda/dagger_shuffled.cuh
+++ b/libethash-cuda/dagger_shuffled.cuh
@@ -30,7 +30,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 			uint2 shuffle[8];
 			for (int j = 0; j < 8; j++) 
 			{
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 				shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH);
 				shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH);
 #else
@@ -45,7 +45,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 				case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break;
 				case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break;
 			}
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 			init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
 #else
 			init0[p] = __shfl_sync(0xFFFFFFFF,shuffle[0].x, 0, THREADS_PER_HASH);
@@ -61,7 +61,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
 					offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
 #else
 					offset[p] = __shfl_sync(0xFFFFFFFF,offset[p], t, THREADS_PER_HASH);
@@ -85,7 +85,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 			uint32_t thread_mix = fnv_reduce(mix[p]);
 
 			// update mix accross threads
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 			shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
 			shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
 			shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 7ea5f7ed2e..7d3c89a11e 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -86,7 +86,7 @@ ethash_calculate_dag_item(uint32_t start)
 #else
 		for (uint32_t t = 0; t < 4; t++) {
 
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 			uint32_t shuffle_index = __shfl(parent_index, t, 4);
 #else
 			uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,parent_index, t, 4);
@@ -95,7 +95,7 @@ ethash_calculate_dag_item(uint32_t start)
 			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
 			for (int w = 0; w < 4; w++) {
 
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 				uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
 #else
 				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
@@ -117,14 +117,14 @@ ethash_calculate_dag_item(uint32_t start)
 	}
 #else
 	for (uint32_t t = 0; t < 4; t++) {
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 		uint32_t shuffle_index = __shfl(node_index, t, 4);
 #else
 		uint32_t shuffle_index = __shfl_sync(node_index, t, 4);
 #endif
 		uint4 s[4];
 		for (uint32_t w = 0; w < 4; w++) {
-#if __CUDA_ARCH__ < SHUFFLE_DEPRECATED
+#if CUDA_VERSION < SHUFFLE_DEPRECATED
 			s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
 #else
 			s[w] = make_uint4(__shfl_sync(dag_node.uint4s[w].x, t, 4), __shfl_sync(dag_node.uint4s[w].y, t, 4), __shfl_sync(dag_node.uint4s[w].z, t, 4), __shfl_sync(dag_node.uint4s[w].w, t, 4));
diff --git a/libethash-cuda/ethash_cuda_miner_kernel_globals.h b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
index 8f831e8d24..e467108666 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@@ -1,8 +1,8 @@
 #ifndef _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
 #define _ETHASH_CUDA_MINER_KERNEL_GLOBALS_H_
 
-#define SHUFFLE_MIN_VER 300
-#define SHUFFLE_DEPRECATED 900
+#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
+#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
 
 //#include "cuda_helper.h"
 

From 5e63e339f45eb1c581b5d242faeca4d00f086507 Mon Sep 17 00:00:00 2001
From: Maciej <mbajkowski@hotmail.com>
Date: Sat, 4 Nov 2017 18:18:38 -0500
Subject: [PATCH 09/41] Update to fix stratum failover argument handling

The stratum-failover argument handling in the existing implementation is not consistent. The existing code implements: stratum-failover and FS. However, the help screen uses both --stratum-failover and --failover-stratum. --failover-stratum is not actually implemented in the code, only the abbreviation FS is. This change modifies the code to use SF and --stratum-failover to be more consistent across the board.
---
 ethminer/MinerAux.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h
index 0caacb52af..44dff3cd39 100644
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@@ -102,7 +102,7 @@ class MinerCLI
 			m_farmURL = argv[++i];
 			m_activeFarmURL = m_farmURL;
 		}
-		else if ((arg == "-FF" || arg == "-FS" || arg == "--farm-failover" || arg == "--stratum-failover") && i + 1 < argc)
+		else if ((arg == "-FF" || arg == "-SF" || arg == "--farm-failover" || arg == "--stratum-failover") && i + 1 < argc)
 		{
 			string url = argv[++i];
 
@@ -562,7 +562,7 @@ class MinerCLI
 			<< "	--farm-retries <n> Number of retries until switch to failover (default: 3)" << endl
 #if ETH_STRATUM
 			<< "	-S, --stratum <host:port>  Put into stratum mode with the stratum server at host:port" << endl
-			<< "	-FS, --failover-stratum <host:port>  Failover stratum server at host:port" << endl
+			<< "	-SF, --stratum-failover <host:port>  Failover stratum server at host:port" << endl
 			<< "    -O, --userpass <username.workername:password> Stratum login credentials" << endl
 			<< "    -FO, --failover-userpass <username.workername:password> Failover stratum login credentials (optional, will use normal credentials when omitted)" << endl
 			<< "    --work-timeout <n> reconnect/failover after n seconds of working on the same (stratum) job. Defaults to 180. Don't set lower than max. avg. block time" << endl

From 30aa051f996137fd00587c20d6275dcb7e7bd9e9 Mon Sep 17 00:00:00 2001
From: Maciej <mbajkowski@hotmail.com>
Date: Wed, 8 Nov 2017 09:46:24 -0600
Subject: [PATCH 10/41] updated for backwards compatibility with -FS argument

Re-introduced -FS into the arguments parsing, just so that it does not create problems with existing command lines that people may have
---
 ethminer/MinerAux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ethminer/MinerAux.h b/ethminer/MinerAux.h
index 44dff3cd39..e525a86842 100644
--- a/ethminer/MinerAux.h
+++ b/ethminer/MinerAux.h
@@ -102,7 +102,7 @@ class MinerCLI
 			m_farmURL = argv[++i];
 			m_activeFarmURL = m_farmURL;
 		}
-		else if ((arg == "-FF" || arg == "-SF" || arg == "--farm-failover" || arg == "--stratum-failover") && i + 1 < argc)
+		else if ((arg == "-FF" || arg == "-SF" || arg == "-FS" || arg == "--farm-failover" || arg == "--stratum-failover") && i + 1 < argc)
 		{
 			string url = argv[++i];
 

From 1a3a07e5df7fbad33fa73eac925c6bab3b8429d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?= <chfast@gmail.com>
Date: Thu, 23 Nov 2017 17:27:47 +0100
Subject: [PATCH 11/41] Travis CI: Upgrade Xcode

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ab044b9770..d44c8519c4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,7 +16,7 @@ matrix:
           sudo: required
           env: CUDA=ON
         - os: osx
-          osx_image: xcode8.3
+          osx_image: xcode9.1
           env: CUDA=OFF
 cache:
     directories:
@@ -24,7 +24,7 @@ cache:
 before_install:
     - if [[ "$CUDA" = ON ]]; then source scripts/install-cuda-trusty.sh; fi
     - if [ "$TRAVIS_OS_NAME" = linux ]; then scripts/install_cmake.sh; fi
-    - pip install requests
+    - sudo pip install requests
 script:
     - cmake -DHUNTER_JOBS_NUMBER=4 -DETHASHCUDA=$CUDA -DETHASHCL=ON -DAPICORE=ON -H. -Bbuild
     - cmake --build build -- -j4

From a50e679e59f91f3c871d726959b41b96534c6332 Mon Sep 17 00:00:00 2001
From: gidoBOSSftw5731 <gidoBOSSftw5731@users.noreply.github.com>
Date: Fri, 24 Nov 2017 10:02:35 -0500
Subject: [PATCH 12/41] Extend FAQ in README (#313)

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8829082aa5..4aaaa4f07d 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,7 @@ All bug reports, pull requests and code reviews are very much welcome.
 
 4. Can I still mine ETH with my 2GB GPU?
 
-   No.
+   Not really, your VRAM must be above the DAG size (Currently about 2.15 GB.) to get best performance. Without it severe hash loss will occur.
 
 5. What are the optimal launch parameters?
 
@@ -177,6 +177,9 @@ All bug reports, pull requests and code reviews are very much welcome.
 
    [Genoil's fork] was the original source of this version, but as Genoil is no longer consistently maintaining that fork it became almost impossible for developers to get new code merged there. In the interests of progressing development without waiting for reviews this fork should be considered the active one and Genoil's as legacy code.
 
+8. Can I CPU Mine?
+ 
+  No, use geth, the go program made for ethereum by ethereum.
 
 
 

From 83be30e78b324be420b5db55637888745b4ba3b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?= <chfast@gmail.com>
Date: Fri, 24 Nov 2017 16:50:37 +0100
Subject: [PATCH 13/41] Tracis CI: Upgrade CUDA to 9.0

---
 .travis.yml                                       |  2 +-
 ...-cuda-trusty.sh => install-cuda-ubuntu1604.sh} | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)
 rename scripts/{install-cuda-trusty.sh => install-cuda-ubuntu1604.sh} (54%)

diff --git a/.travis.yml b/.travis.yml
index d44c8519c4..07ad0e9a85 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,7 +22,7 @@ cache:
     directories:
         - $HOME/.local
 before_install:
-    - if [[ "$CUDA" = ON ]]; then source scripts/install-cuda-trusty.sh; fi
+    - if [ "$CUDA" = ON ]; then . scripts/install-cuda-ubuntu1604.sh; fi
     - if [ "$TRAVIS_OS_NAME" = linux ]; then scripts/install_cmake.sh; fi
     - sudo pip install requests
 script:
diff --git a/scripts/install-cuda-trusty.sh b/scripts/install-cuda-ubuntu1604.sh
similarity index 54%
rename from scripts/install-cuda-trusty.sh
rename to scripts/install-cuda-ubuntu1604.sh
index 6a6df53d41..69e7451335 100755
--- a/scripts/install-cuda-trusty.sh
+++ b/scripts/install-cuda-ubuntu1604.sh
@@ -8,13 +8,16 @@
 #
 # Taken from https://github.com/tmcdonell/travis-scripts.
 
-export CUDA_VER=8.0.61-1
-travis_retry wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_${CUDA_VER}_amd64.deb
-travis_retry sudo dpkg -i cuda-repo-ubuntu1404_${CUDA_VER}_amd64.deb
-travis_retry sudo apt-get update -qq
+set -e
+
+export CUDA_VER=9.0.176-1
+sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
+wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_${CUDA_VER}_amd64.deb
+sudo dpkg -i cuda-repo-ubuntu1604_${CUDA_VER}_amd64.deb
+sudo apt-get update -qq
 export CUDA_APT=${CUDA_VER:0:3}
 export CUDA_APT=${CUDA_APT/./-}
-travis_retry sudo apt-get install -qy cuda-core-${CUDA_APT} cuda-cudart-dev-${CUDA_APT}
-travis_retry sudo apt-get clean
+sudo apt-get install -qy cuda-core-${CUDA_APT} cuda-cudart-dev-${CUDA_APT}
+sudo apt-get clean
 export CUDA_HOME=/usr/local/cuda-${CUDA_VER:0:3}
 export PATH=${CUDA_HOME}/bin:${PATH}

From 6177c3baf298073e63d58137d7080b6672c23d72 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Fri, 24 Nov 2017 19:43:17 +0100
Subject: [PATCH 14/41] fixed dumb error

---
 libethash-cuda/ethash_cuda_miner_kernel.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 7d3c89a11e..d25cc14abe 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -120,14 +120,14 @@ ethash_calculate_dag_item(uint32_t start)
 #if CUDA_VERSION < SHUFFLE_DEPRECATED
 		uint32_t shuffle_index = __shfl(node_index, t, 4);
 #else
-		uint32_t shuffle_index = __shfl_sync(node_index, t, 4);
+		uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,node_index, t, 4);
 #endif
 		uint4 s[4];
 		for (uint32_t w = 0; w < 4; w++) {
 #if CUDA_VERSION < SHUFFLE_DEPRECATED
 			s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
 #else
-			s[w] = make_uint4(__shfl_sync(dag_node.uint4s[w].x, t, 4), __shfl_sync(dag_node.uint4s[w].y, t, 4), __shfl_sync(dag_node.uint4s[w].z, t, 4), __shfl_sync(dag_node.uint4s[w].w, t, 4));
+			s[w] = make_uint4(__shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].x, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].y, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].z, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].w, t, 4));
 #endif
 		}
 		dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];

From 1dbfb19ea4a752111ec576686105222fa15fb568 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Sat, 28 Oct 2017 17:51:28 +0200
Subject: [PATCH 15/41] Started on #359

---
 libethash-cuda/CUDAMiner.cpp               | 100 ++++++++++++---------
 libethash-cuda/CUDAMiner.h                 |   1 +
 libethash-cuda/ethash_cuda_miner.cpp       |  27 ++++--
 libethash-cuda/ethash_cuda_miner.h         |   1 +
 libethash-cuda/ethash_cuda_miner_kernel.cu |   2 +-
 5 files changed, 77 insertions(+), 54 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index f870615999..75ac9cde76 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -116,73 +116,85 @@ void CUDAMiner::kickOff()
 	startWorking();
 }
 
-void CUDAMiner::workLoop()
+void CUDAMiner::initDevice(WorkPackage w)
 {
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
-		WorkPackage w = work();
-		if (!w)
-			return;
-
-		cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
-		if (!m_miner || m_minerSeed != w.seed)
+		unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
+		if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
 		{
-			unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
-
-			if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
+			while (s_dagLoadIndex < index) {
+				this_thread::sleep_for(chrono::milliseconds(100)); //changed this from 1 second
+			}
+		}
+		else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		{
+			if (device != s_dagCreateDevice)
 			{
-				while (s_dagLoadIndex < index) {
-					this_thread::sleep_for(chrono::seconds(1));
+				// wait until DAG is created on selected device
+				while (s_dagInHostMemory == NULL) {
+					this_thread::sleep_for(chrono::milliseconds(100));//changed this from 1 second
 				}
 			}
-			else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+			else
 			{
-				if (device != s_dagCreateDevice)
-				{
-					// wait until DAG is created on selected device
-					while (s_dagInHostMemory == NULL) {
-						this_thread::sleep_for(chrono::seconds(1));
-					}
-				}
-				else
-				{
-					// reset load index
-					s_dagLoadIndex = 0;
-				}
+				// reset load index
+				s_dagLoadIndex = 0;
 			}
+		}
 
-			cnote << "Initialising miner...";
-			m_minerSeed = w.seed;
+		cnote << "Initialising miner...";
+		m_minerSeed = w.seed;
 
-			delete m_miner;
-			m_miner = new ethash_cuda_miner;
+		delete m_miner;
+		m_miner = new ethash_cuda_miner;
 
-			EthashAux::LightType light;
-			light = EthashAux::light(w.seed);
-			//bytesConstRef dagData = dag->data();
-			bytesConstRef lightData = light->data();
+		EthashAux::LightType light;
+		light = EthashAux::light(w.seed);
+		//bytesConstRef dagData = dag->data();
+		bytesConstRef lightData = light->data();
 
-			m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
-			s_dagLoadIndex++;
+		m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
+		s_dagLoadIndex++;
 
-			if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		{
+			if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
 			{
-				if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
-				{
-					// all devices have loaded DAG, we can free now
-					delete[] s_dagInHostMemory;
-					s_dagInHostMemory = NULL;
-
-					cout << "Freeing DAG from host" << endl;
-				}
+				// all devices have loaded DAG, we can free now
+				delete[] s_dagInHostMemory;
+				s_dagInHostMemory = NULL;
+				cout << "Freeing DAG from host" << endl;
 			}
 		}
 
+	}
+	catch (std::runtime_error const& _e)
+	{
+		delete m_miner;
+		m_miner = nullptr;
+		cwarn << "Error CUDA mining: " << _e.what();
+	}
+}
+
+void CUDAMiner::workLoop()
+{
+	try
+	{
+		WorkPackage w = work();
+		if(!w)
+			return;
+		cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
+		if (!m_miner || m_minerSeed != w.seed)
+		{
+			initDevice(w);
+		}
 		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
 		uint64_t startN = w.startNonce;
 		if (w.exSizeBits >= 0)
 			startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
 		m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
+
 	}
 	catch (std::runtime_error const& _e)
 	{
diff --git a/libethash-cuda/CUDAMiner.h b/libethash-cuda/CUDAMiner.h
index 82226cc2a3..5157e2297e 100644
--- a/libethash-cuda/CUDAMiner.h
+++ b/libethash-cuda/CUDAMiner.h
@@ -76,6 +76,7 @@ class EthashCUDAHook;
 	private:
 		void workLoop() override;
 		void report(uint64_t _nonce);
+		void initDevice(WorkPackage w);
 
 		EthashCUDAHook* m_hook = nullptr;
 		ethash_cuda_miner* m_miner = nullptr;
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index d2d7fe45b2..d36075eacc 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -205,24 +205,19 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 {
 	try
 	{
-		int device_count = getNumDevices();
+		unsigned device_count = getNumDevices();
 
 		if (device_count == 0)
 			return false;
 
 		// use selected device
-		int device_num = std::min<int>((int)_deviceId, device_count - 1);
+		int device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
 
 		cudaDeviceProp device_props;
 		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
-		CUDA_SAFE_CALL(cudaSetDevice(device_num));
-		CUDA_SAFE_CALL(cudaDeviceReset());
-		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
-		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
-
 		m_search_buf = new volatile uint32_t *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];
 
@@ -230,13 +225,27 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
+		if(dagSize != m_current_dagSize)
+		{
+			//We need to reset the device
+		}else
+		{
+			//We only need to reset the light 
+		}
+		CUDA_SAFE_CALL(cudaSetDevice(device_num));
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+
+
+
 		// create buffer for cache
 		hash64_t * light = NULL;
 
 		if (!*hostDAG)
 		{
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
-			// copy dag cache to CPU.
+			// copy lightData to device
 			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
 		}
 
@@ -278,7 +287,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			const void* hdag = (const void*)(*hostDAG);
 			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
 		}
-
+		m_current_dagSize = dagSize;
 		return true;
 	}
 	catch (runtime_error const&)
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 65134cff43..170ac3f7e6 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -54,6 +54,7 @@ class ethash_cuda_miner
 	uint64_t m_current_nonce;
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
+	uint64_t m_current_dagSize;
 
 	uint32_t m_sharedBytes;
 
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index d25cc14abe..1fcffbab29 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -100,7 +100,7 @@ ethash_calculate_dag_item(uint32_t start)
 #else
 				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
 #endif
-				if (t == thread_id) {
+				if (t == thread_id) { //potential performance leak (branching in warp)
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}

From a62bd1f2072e70b6bba5b41946de9e6b3d6354c4 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 23 Nov 2017 12:10:49 +0100
Subject: [PATCH 16/41] Rewrote CUDAMiner

---
 libethash-cuda/CUDAMiner.cpp               | 47 ++++++++++++++--------
 libethash-cuda/CUDAMiner.h                 |  2 +
 libethash-cuda/ethash_cuda_miner.cpp       |  2 +-
 libethash-cuda/ethash_cuda_miner_kernel.cu |  5 ++-
 4 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 75ac9cde76..8b2c76a5ce 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -116,7 +116,7 @@ void CUDAMiner::kickOff()
 	startWorking();
 }
 
-void CUDAMiner::initDevice(WorkPackage w)
+bool CUDAMiner::init(const h256& seed)
 {
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
@@ -144,13 +144,13 @@ void CUDAMiner::initDevice(WorkPackage w)
 		}
 
 		cnote << "Initialising miner...";
-		m_minerSeed = w.seed;
+		m_minerSeed = seed;
 
 		delete m_miner;
 		m_miner = new ethash_cuda_miner;
 
 		EthashAux::LightType light;
-		light = EthashAux::light(w.seed);
+		light = EthashAux::light(seed);
 		//bytesConstRef dagData = dag->data();
 		bytesConstRef lightData = light->data();
 
@@ -167,34 +167,49 @@ void CUDAMiner::initDevice(WorkPackage w)
 				cout << "Freeing DAG from host" << endl;
 			}
 		}
-
+		return true;
 	}
 	catch (std::runtime_error const& _e)
 	{
 		delete m_miner;
 		m_miner = nullptr;
 		cwarn << "Error CUDA mining: " << _e.what();
+		return false;
 	}
 }
 
 void CUDAMiner::workLoop()
 {
+	WorkPackage current;
+	current.header = h256{1u};
+	current.seed = h256{1u};
 	try
 	{
-		WorkPackage w = work();
-		if(!w)
-			return;
-		cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
-		if (!m_miner || m_minerSeed != w.seed)
+		while(true)
 		{
-			initDevice(w);
+			WorkPackage w = work();
+			if(current.header != w.header)
+			{
+				if(!w)
+				{
+					cnote << "No work. Pause for 3 s.";
+					std::this_thread::sleep_for(std::chrono::seconds(3));
+					continue;
+				}
+				
+				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
+				if (current.seed != w.seed || !m_miner)
+				{
+					init(w.seed);
+				}
+			}
+			
+			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
+			uint64_t startN = w.startNonce;
+			if (w.exSizeBits >= 0)
+				startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
+			m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
 		}
-		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
-		uint64_t startN = w.startNonce;
-		if (w.exSizeBits >= 0)
-			startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
-		m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
-
 	}
 	catch (std::runtime_error const& _e)
 	{
diff --git a/libethash-cuda/CUDAMiner.h b/libethash-cuda/CUDAMiner.h
index 5157e2297e..ce4a3255e8 100644
--- a/libethash-cuda/CUDAMiner.h
+++ b/libethash-cuda/CUDAMiner.h
@@ -78,6 +78,8 @@ class EthashCUDAHook;
 		void report(uint64_t _nonce);
 		void initDevice(WorkPackage w);
 
+		bool init(const h256& seed);
+
 		EthashCUDAHook* m_hook = nullptr;
 		ethash_cuda_miner* m_miner = nullptr;
 
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index d36075eacc..2da902ed30 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -259,7 +259,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
 			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
 		}
-		set_constants(dag, dagSize128, light, lightSize64);
+		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
 		memset(&m_current_header, 0, sizeof(hash32_t));
 		m_current_target = 0;
 		m_current_nonce = 0;
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 1fcffbab29..5ae6f89649 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -100,12 +100,13 @@ ethash_calculate_dag_item(uint32_t start)
 #else
 				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
 #endif
-				if (t == thread_id) { //potential performance leak (branching in warp)
+				if (t == thread_id) {
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}
-
 		}
+
+
 #endif		
 	}
 	SHA3_512(dag_node.uint2s);

From a7e8dca74f481686411a1ba559d559ce515cd6d4 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 27 Nov 2017 12:03:55 +0100
Subject: [PATCH 17/41] minor fix

---
 libethash-cuda/CUDAMiner.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 8b2c76a5ce..e4a117c187 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -187,7 +187,8 @@ void CUDAMiner::workLoop()
 	{
 		while(true)
 		{
-			WorkPackage w = work();
+			const WorkPackage w = work();
+			
 			if(current.header != w.header)
 			{
 				if(!w)
@@ -203,7 +204,7 @@ void CUDAMiner::workLoop()
 					init(w.seed);
 				}
 			}
-			
+			current = w;
 			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
 			uint64_t startN = w.startNonce;
 			if (w.exSizeBits >= 0)

From 946783cdfacabe324239aadf048893e8f90f3a95 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Sat, 2 Dec 2017 22:38:04 +0100
Subject: [PATCH 18/41] rewrote the kernel once again

---
 libethash-cuda/CUDAMiner.cpp         | 23 ++++---
 libethash-cuda/ethash_cuda_miner.cpp | 93 ++++++++++++++--------------
 libethash-cuda/ethash_cuda_miner.h   |  5 +-
 3 files changed, 66 insertions(+), 55 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index e4a117c187..0abbcb92e0 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -17,6 +17,8 @@ along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
 /** @file CUDAMiner.cpp
 * @author Gav Wood <i@gavwood.com>
 * @date 2014
+* @author MariusVanDerWijden
+* @date 2017
 *
 * Determines the PoW algorithm.
 */
@@ -112,8 +114,8 @@ void CUDAMiner::report(uint64_t _nonce)
 
 void CUDAMiner::kickOff()
 {
-	m_hook->reset();
-	startWorking();
+	//m_hook->reset();
+	//startWorking();
 }
 
 bool CUDAMiner::init(const h256& seed)
@@ -146,8 +148,10 @@ bool CUDAMiner::init(const h256& seed)
 		cnote << "Initialising miner...";
 		m_minerSeed = seed;
 
-		delete m_miner;
-		m_miner = new ethash_cuda_miner;
+		//delete m_miner;
+		//m_miner = new ethash_cuda_miner;
+		if(!m_miner)
+			m_miner = new ethash_cuda_miner;
 
 		EthashAux::LightType light;
 		light = EthashAux::light(seed);
@@ -189,7 +193,7 @@ void CUDAMiner::workLoop()
 		{
 			const WorkPackage w = work();
 			
-			if(current.header != w.header)
+			if(!m_miner || current.header != w.header)
 			{
 				if(!w)
 				{
@@ -199,9 +203,10 @@ void CUDAMiner::workLoop()
 				}
 				
 				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
-				if (current.seed != w.seed || !m_miner)
+				if (!m_miner || current.seed != w.seed)
 				{
-					init(w.seed);
+					if(!init(w.seed))
+						break;
 				}
 			}
 			current = w;
@@ -222,8 +227,8 @@ void CUDAMiner::workLoop()
 
 void CUDAMiner::pause()
 {
-	m_hook->abort();
-	stopWorking();
+	//m_hook->abort();
+	//stopWorking();
 }
 
 std::string CUDAMiner::platformInfo()
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 2da902ed30..28a0182f89 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -17,6 +17,8 @@
 /** @file ethash_cuda_miner.cpp
 * @author Genoil <jw@meneer.net>
 * @date 2015
+* @coauthor MariusVanDerWijden
+* @date 2017
 */
 
 
@@ -225,20 +227,6 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
-		if(dagSize != m_current_dagSize)
-		{
-			//We need to reset the device
-		}else
-		{
-			//We only need to reset the light 
-		}
-		CUDA_SAFE_CALL(cudaSetDevice(device_num));
-		CUDA_SAFE_CALL(cudaDeviceReset());
-		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
-		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
-
-
-
 		// create buffer for cache
 		hash64_t * light = NULL;
 
@@ -248,46 +236,61 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			// copy lightData to device
 			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
 		}
-
-		// create buffer for dag
-		hash128_t * dag;
-		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
 		
-		// create mining buffers
-		for (unsigned i = 0; i != s_numStreams; ++i)
+		
+		hash128_t * dag = m_dag;
+		if(dagSize128 != m_dag_size || !dag)
 		{
-			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
-			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
-		}
-		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
-		memset(&m_current_header, 0, sizeof(hash32_t));
-		m_current_target = 0;
-		m_current_nonce = 0;
-		m_current_index = 0;
-
-		m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
+			//We need to reset the device and recreate the dag
+			CUDA_SAFE_CALL(cudaSetDevice(device_num));
+			CUDA_SAFE_CALL(cudaDeviceReset());
+			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+			
+			// create buffer for dag
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
+			
+			// create mining buffers
+			for (unsigned i = 0; i != s_numStreams; ++i)
+			{
+				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+				CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
+			}
+			
+			memset(&m_current_header, 0, sizeof(hash32_t));
+			m_current_target = 0;
+			m_current_nonce = 0;
+			m_current_index = 0;
 
-		if (!*hostDAG)
-		{
-			cudalog << "Generating DAG for GPU #" << device_num;
-			ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+			m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
 
-			if (_cpyToHost)
+			if (!*hostDAG)
 			{
-				uint8_t* memoryDAG = new uint8_t[dagSize];
-				cudalog << "Copying DAG from GPU #" << device_num << " to host";
-				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
+				cudalog << "Generating DAG for GPU #" << device_num;
+				ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+
+				if (_cpyToHost)
+				{
+					uint8_t* memoryDAG = new uint8_t[dagSize];
+					cudalog << "Copying DAG from GPU #" << device_num << " to host";
+					CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
 
-				*hostDAG = (void*)memoryDAG;
+					*hostDAG = (void*)memoryDAG;
+				}
 			}
-		}
-		else
+			else
+			{
+				cudalog << "Copying DAG from host to GPU #" << device_num;
+				const void* hdag = (const void*)(*hostDAG);
+				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
+			}
+		}else
 		{
-			cudalog << "Copying DAG from host to GPU #" << device_num;
-			const void* hdag = (const void*)(*hostDAG);
-			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
+			//We only need to reset the light 
 		}
-		m_current_dagSize = dagSize;
+		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
+		m_dag = dag;
+		m_dag_size = dagSize128;
 		return true;
 	}
 	catch (runtime_error const&)
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 170ac3f7e6..c22760b64f 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -54,9 +54,12 @@ class ethash_cuda_miner
 	uint64_t m_current_nonce;
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
-	uint64_t m_current_dagSize;
 
 	uint32_t m_sharedBytes;
+	
+	///Constants on GPU
+	hash128_t* m_dag;
+	uint32_t m_dag_size;
 
 	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

From 82d0daf72bbfb4664550e13edbf0d1899d4dd17f Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Sun, 3 Dec 2017 19:32:34 +0100
Subject: [PATCH 19/41] minor improvements

---
 libethash-cuda/ethash_cuda_miner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 28a0182f89..249cbfd02c 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -213,7 +213,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			return false;
 
 		// use selected device
-		int device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
+		unsigned device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
 
 		cudaDeviceProp device_props;
 		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));

From 87d13645437dbc4e210a07ba5ec78a82b0f90566 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Sun, 3 Dec 2017 19:33:40 +0100
Subject: [PATCH 20/41] minor improvements

---
 libethash-cuda/CUDAMiner.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 0abbcb92e0..72551985e6 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -64,7 +64,8 @@ namespace eth
 	protected:
 		virtual bool found(uint64_t const* _nonces, uint32_t _count) override
 		{
-			m_owner.report(_nonces[0]);
+			for(int i = 0; i < _count; i++)
+				m_owner.report(_nonces[i]); //changed this to report all found nonces
 			return m_owner.shouldStop();
 		}
 
@@ -195,7 +196,7 @@ void CUDAMiner::workLoop()
 			
 			if(!m_miner || current.header != w.header)
 			{
-				if(!w)
+				if(!w || w.header == h256())
 				{
 					cnote << "No work. Pause for 3 s.";
 					std::this_thread::sleep_for(std::chrono::seconds(3));

From 2e0f674efdad2293ff5631ee0b24a57add8a5f4a Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 4 Dec 2017 14:28:33 +0100
Subject: [PATCH 21/41] restructured bad code

---
 libethash-cuda/CUDAMiner.cpp         | 43 ++++++++++++++--------------
 libethash-cuda/ethash_cuda_miner.cpp | 19 +++++++-----
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 3277de955c..21ba45fe07 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -124,27 +124,6 @@ bool CUDAMiner::init(const h256& seed)
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
 		unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
-		if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
-		{
-			while (s_dagLoadIndex < index) {
-				this_thread::sleep_for(chrono::milliseconds(100)); //changed this from 1 second
-			}
-		}
-		else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
-		{
-			if (device != s_dagCreateDevice)
-			{
-				// wait until DAG is created on selected device
-				while (s_dagInHostMemory == NULL) {
-					this_thread::sleep_for(chrono::milliseconds(100));//changed this from 1 second
-				}
-			}
-			else
-			{
-				// reset load index
-				s_dagLoadIndex = 0;
-			}
-		}
 
 		cnote << "Initialising miner...";
 		m_minerSeed = seed;
@@ -206,6 +185,28 @@ void CUDAMiner::workLoop()
 				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 				if (!m_miner || current.seed != w.seed)
 				{
+					unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
+					if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
+					{
+						while (s_dagLoadIndex < index) {
+							this_thread::sleep_for(chrono::milliseconds(100)); 
+						}
+					}
+					else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+					{
+						if (device != s_dagCreateDevice)
+						{
+							// wait until DAG is created on selected device
+							while (s_dagInHostMemory == NULL) {
+								this_thread::sleep_for(chrono::milliseconds(100));
+							}
+						}
+						else
+						{
+							// reset load index
+							s_dagLoadIndex = 0;
+						}
+					}
 					if(!init(w.seed))
 						break;
 				}
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 94cf2fec8b..4b221ac255 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -217,7 +217,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		nvmlh = wrap_nvml_create();
 
 		cudaDeviceProp device_props;
-		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, m_device_num));
+		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
@@ -230,6 +230,16 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 		// create buffer for cache
 		hash64_t * light = NULL;
+		
+		if(dagSize128 != m_dag_size || !dag || s_dagLoadIndex < index)
+		{
+			//We need to reset the device and recreate the dag  
+			//move this before light alloc!!!
+			CUDA_SAFE_CALL(cudaSetDevice(device_num));
+			CUDA_SAFE_CALL(cudaDeviceReset());
+			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+		}
 
 		if (!*hostDAG)
 		{
@@ -240,13 +250,8 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		
 		
 		hash128_t * dag = m_dag;
-		if(dagSize128 != m_dag_size || !dag)
+		if(dagSize128 != m_dag_size || !dag || s_dagLoadIndex < index)
 		{
-			//We need to reset the device and recreate the dag
-			CUDA_SAFE_CALL(cudaSetDevice(device_num));
-			CUDA_SAFE_CALL(cudaDeviceReset());
-			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
-			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 			
 			// create buffer for dag
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));

From 923ff2c3b97260484880b54536cebb5091d4202c Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 4 Dec 2017 15:37:30 +0100
Subject: [PATCH 22/41] fixed compile errors

---
 libethash-cuda/CUDAMiner.cpp         |  2 +-
 libethash-cuda/ethash_cuda_miner.cpp | 10 +++++-----
 libethash-cuda/ethash_cuda_miner.h   |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 21ba45fe07..75409a30d1 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -138,7 +138,7 @@ bool CUDAMiner::init(const h256& seed)
 		//bytesConstRef dagData = dag->data();
 		bytesConstRef lightData = light->data();
 
-		m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
+		m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory, s_dagLoadIndex < index);
 		s_dagLoadIndex++;
 
 		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 4b221ac255..8f87a4a3ef 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -203,7 +203,7 @@ void ethash_cuda_miner::finish()
 	CUDA_SAFE_CALL(cudaDeviceReset());
 }
 
-bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG)
+bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG, bool reloadDag)
 {
 	try
 	{
@@ -230,12 +230,13 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 		// create buffer for cache
 		hash64_t * light = NULL;
+		hash128_t * dag = m_dag;
 		
-		if(dagSize128 != m_dag_size || !dag || s_dagLoadIndex < index)
+		CUDA_SAFE_CALL(cudaSetDevice(device_num));
+		if(dagSize128 != m_dag_size || !dag || reloadDag)
 		{
 			//We need to reset the device and recreate the dag  
 			//move this before light alloc!!!
-			CUDA_SAFE_CALL(cudaSetDevice(device_num));
 			CUDA_SAFE_CALL(cudaDeviceReset());
 			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
@@ -249,8 +250,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		}
 		
 		
-		hash128_t * dag = m_dag;
-		if(dagSize128 != m_dag_size || !dag || s_dagLoadIndex < index)
+		if(dagSize128 != m_dag_size || !dag || reloadDag)
 		{
 			
 			// create buffer for dag
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 364ae1b218..b5a46bc0b8 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -37,7 +37,7 @@ class ethash_cuda_miner
 		);
         static void setParallelHash(unsigned _parallelHash);
 
-	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG);
+	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG,bool reloadDag);
 
 	void finish();
 	void search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN);

From 89eb45afc76952bb73ba7e649c16580566e9a51c Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Mon, 4 Dec 2017 21:30:29 +0100
Subject: [PATCH 23/41] restructure code, added output to find errors

---
 libethash-cuda/CUDAMiner.cpp               |  8 +++--
 libethash-cuda/ethash_cuda_miner.cpp       | 36 +++++++++++++---------
 libethash-cuda/ethash_cuda_miner_kernel.cu |  3 +-
 3 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 75409a30d1..aa490cdcd5 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -138,7 +138,8 @@ bool CUDAMiner::init(const h256& seed)
 		//bytesConstRef dagData = dag->data();
 		bytesConstRef lightData = light->data();
 
-		m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory, s_dagLoadIndex < index);
+		m_miner->init(light->light, lightData.data(), lightData.size(), 
+			device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory, device == 0);
 		s_dagLoadIndex++;
 
 		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
@@ -184,8 +185,9 @@ void CUDAMiner::workLoop()
 				
 				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 				if (!m_miner || current.seed != w.seed)
-				{
+				{/*
 					unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
+					
 					if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
 					{
 						while (s_dagLoadIndex < index) {
@@ -206,7 +208,7 @@ void CUDAMiner::workLoop()
 							// reset load index
 							s_dagLoadIndex = 0;
 						}
-					}
+					}*/
 					if(!init(w.seed))
 						break;
 				}
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 8f87a4a3ef..bc3bae1cbf 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -203,7 +203,7 @@ void ethash_cuda_miner::finish()
 	CUDA_SAFE_CALL(cudaDeviceReset());
 }
 
-bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG, bool reloadDag)
+bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG, bool genDag)
 {
 	try
 	{
@@ -233,10 +233,11 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		hash128_t * dag = m_dag;
 		
 		CUDA_SAFE_CALL(cudaSetDevice(device_num));
-		if(dagSize128 != m_dag_size || !dag || reloadDag)
+		cudalog << "Set Device to current";
+		if(dagSize128 != m_dag_size || !dag)
 		{
 			//We need to reset the device and recreate the dag  
-			//move this before light alloc!!!
+			cudalog << "Resetting device: ";
 			CUDA_SAFE_CALL(cudaDeviceReset());
 			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
@@ -250,9 +251,8 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		}
 		
 		
-		if(dagSize128 != m_dag_size || !dag || reloadDag)
+		if(dagSize128 != m_dag_size || !dag)
 		{
-			
 			// create buffer for dag
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
 			
@@ -272,16 +272,22 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 			if (!*hostDAG)
 			{
-				cudalog << "Generating DAG for GPU #" << device_num;
-				ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
-
-				if (_cpyToHost)
-				{
-					uint8_t* memoryDAG = new uint8_t[dagSize];
-					cudalog << "Copying DAG from GPU #" << device_num << " to host";
-					CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
-
-					*hostDAG = (void*)memoryDAG;
+				if(genDag || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
+					cudalog << "Generating DAG for GPU #" << device_num << " with dagSize: " 
+							<< dagSize <<" gridSize: " << s_gridSize << " &m_streams[0]: " << &m_streams[0];
+					ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+
+					if (_cpyToHost)
+					{
+						uint8_t* memoryDAG = new uint8_t[dagSize];
+						cudalog << "Copying DAG from GPU #" << device_num << " to host";
+						CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
+
+						*hostDAG = (void*)memoryDAG;
+					}
+				}else{
+					while(!*hostDAG)
+						this_thread::sleep_for(chrono::milliseconds(100)); 
 				}
 			}
 			else
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 5ae6f89649..496f44323d 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -152,7 +152,8 @@ void ethash_generate_dag(
 	for (uint32_t i = 0; i < fullRuns; i++)
 	{
 		ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads);
-		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+		printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns); //TODO delete this
+		CUDA_SAFE_CALL(cudaDeviceSynchronize()); //illegal mem access	
 		printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns);
 	}
 	//printf("GPU#%d 100%%\n");

From 546b6758ea5301ffac8751e97cc589c3ec90c786 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Tue, 5 Dec 2017 11:05:11 +0100
Subject: [PATCH 24/41] minor change

---
 libethash-cuda/CUDAMiner.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index aa490cdcd5..edc504d095 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -64,8 +64,7 @@ namespace eth
 	protected:
 		virtual bool found(uint64_t const* _nonces, uint32_t _count) override
 		{
-			for(int i = 0; i < _count; i++)
-				m_owner.report(_nonces[i]); //changed this to report all found nonces
+			m_owner.report(_nonces[0]);
 			return m_owner.shouldStop();
 		}
 

From 2c2dedd406ace181e14a790da98ab7c6906313cb Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Tue, 5 Dec 2017 20:53:48 +0100
Subject: [PATCH 25/41] fixed another error

---
 libethash-cuda/ethash_cuda_miner.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index bc3bae1cbf..59c93c1d95 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -255,7 +255,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		{
 			// create buffer for dag
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
-			
+			set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
 			// create mining buffers
 			for (unsigned i = 0; i != s_numStreams; ++i)
 			{
@@ -300,7 +300,6 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		{
 			//We only need to reset the light 
 		}
-		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
 		m_dag = dag;
 		m_dag_size = dagSize128;
 		return true;

From 0852ce293a5ffdfd588135f08937245f94368e28 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Tue, 5 Dec 2017 21:20:00 +0100
Subject: [PATCH 26/41] minor update

---
 libethash-cuda/ethash_cuda_miner.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 59c93c1d95..0086bc8a5d 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -288,10 +288,12 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 				}else{
 					while(!*hostDAG)
 						this_thread::sleep_for(chrono::milliseconds(100)); 
+					goto cpyDag;
 				}
 			}
 			else
 			{
+cpyDag:
 				cudalog << "Copying DAG from host to GPU #" << device_num;
 				const void* hdag = (const void*)(*hostDAG);
 				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));

From c710eea3718df7dd946f0a7062900d452bebf3c7 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Tue, 5 Dec 2017 22:16:55 +0100
Subject: [PATCH 27/41] another fix

---
 libethash-cuda/CUDAMiner.cpp               | 37 ++++------------------
 libethash-cuda/ethash_cuda_miner_kernel.cu |  3 +-
 2 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index edc504d095..a7f41a7167 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -184,40 +184,17 @@ void CUDAMiner::workLoop()
 				
 				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 				if (!m_miner || current.seed != w.seed)
-				{/*
-					unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
-					
-					if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
-					{
-						while (s_dagLoadIndex < index) {
-							this_thread::sleep_for(chrono::milliseconds(100)); 
-						}
-					}
-					else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
-					{
-						if (device != s_dagCreateDevice)
-						{
-							// wait until DAG is created on selected device
-							while (s_dagInHostMemory == NULL) {
-								this_thread::sleep_for(chrono::milliseconds(100));
-							}
-						}
-						else
-						{
-							// reset load index
-							s_dagLoadIndex = 0;
-						}
-					}*/
+				{
 					if(!init(w.seed))
 						break;
 				}
+				current = w;
 			}
-			current = w;
-			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
-			uint64_t startN = w.startNonce;
-			if (w.exSizeBits >= 0)
-				startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
-			m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
+			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)current.boundary >> 192);
+			uint64_t startN = current.startNonce;
+			if (current.exSizeBits >= 0)
+				startN = current.startNonce | ((uint64_t)index << (64 - 4 - current.exSizeBits)); // this can support up to 16 devices
+			m_miner->search(current.header.data(), upper64OfBoundary, *m_hook, (current.exSizeBits >= 0), startN);
 		}
 	}
 	catch (std::runtime_error const& _e)
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index 496f44323d..5ae6f89649 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -152,8 +152,7 @@ void ethash_generate_dag(
 	for (uint32_t i = 0; i < fullRuns; i++)
 	{
 		ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads);
-		printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns); //TODO delete this
-		CUDA_SAFE_CALL(cudaDeviceSynchronize()); //illegal mem access	
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
 		printf("CUDA#%d: %.0f%%\n",device, 100.0f * (float)i / (float)fullRuns);
 	}
 	//printf("GPU#%d 100%%\n");

From 9f2d2f1a0104e3bab3dc84251f7807c91113e275 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Wed, 6 Dec 2017 12:13:11 +0100
Subject: [PATCH 28/41] rearanged code and fixed a guard

---
 libethash-cuda/CUDAMiner.cpp         |  2 +-
 libethash-cuda/ethash_cuda_miner.cpp | 16 +++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index a7f41a7167..e8a38fa24d 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -173,7 +173,7 @@ void CUDAMiner::workLoop()
 		{
 			const WorkPackage w = work();
 			
-			if(!m_miner || current.header != w.header)
+			if(!m_miner || current.seed != w.seed)
 			{
 				if(!w || w.header == h256())
 				{
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 0086bc8a5d..b444cada13 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -243,19 +243,17 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 		}
 
-		if (!*hostDAG)
-		{
-			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
-			// copy lightData to device
-			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
-		}
+		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
+		// copy lightData to device
+		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
 		
+		if(dagSize128 != m_dag_size || !dag) // create buffer for dag
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
+			
+		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
 		
 		if(dagSize128 != m_dag_size || !dag)
 		{
-			// create buffer for dag
-			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
-			set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
 			// create mining buffers
 			for (unsigned i = 0; i != s_numStreams; ++i)
 			{

From 81398cfab598d73baddb7b55ac9f6a14debe00f0 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Fri, 8 Dec 2017 09:38:16 +0100
Subject: [PATCH 29/41] Restructured Light Allocation

---
 libethash-cuda/ethash_cuda_miner.cpp | 10 ++++++++--
 libethash-cuda/ethash_cuda_miner.h   |  3 +++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index b444cada13..d0566d596d 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -66,6 +66,8 @@ ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
 {
+	int devicesCount = getNumDevices();
+	m_light = new hash64_t*[devicesCount];
 }
 
 std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
@@ -229,7 +231,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
 		// create buffer for cache
-		hash64_t * light = NULL;
+		hash64_t * light = m_light[device_num];
 		hash128_t * dag = m_dag;
 		
 		CUDA_SAFE_CALL(cudaSetDevice(device_num));
@@ -243,9 +245,13 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 		}
 
-		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
+		if(!light){ 
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
+			cudalog << "Allocating light with size: " << _lightSize;
+		}
 		// copy lightData to device
 		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
+		m_light[device_num] = light;
 		
 		if(dagSize128 != m_dag_size || !dag) // create buffer for dag
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index b5a46bc0b8..6db52653ac 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -61,8 +61,11 @@ class ethash_cuda_miner
 	
 	///Constants on GPU
 	hash128_t* m_dag;
+	hash64_t ** m_light;
 	uint32_t m_dag_size;
 	int m_device_num;
+	
+	
 
 	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;

From 45b615a242974a342a4d0250968c93e1df8ca7af Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Fri, 8 Dec 2017 18:26:08 +0100
Subject: [PATCH 30/41] More output to debug

---
 libethash-cuda/ethash_cuda_miner.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index d0566d596d..077fe27556 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -17,7 +17,7 @@
 /** @file ethash_cuda_miner.cpp
 * @author Genoil <jw@meneer.net>
 * @date 2015
-* @coauthor MariusVanDerWijden
+* @author MariusVanDerWijden
 * @date 2017
 */
 
@@ -68,6 +68,9 @@ ethash_cuda_miner::ethash_cuda_miner()
 {
 	int devicesCount = getNumDevices();
 	m_light = new hash64_t*[devicesCount];
+	for(int i = 0; i < devicesCount; i++)
+		m_light[i] = nullptr;
+	m_dag = nullptr;
 }
 
 std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
@@ -205,7 +208,7 @@ void ethash_cuda_miner::finish()
 	CUDA_SAFE_CALL(cudaDeviceReset());
 }
 
-bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG, bool genDag)
+bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG)
 {
 	try
 	{
@@ -246,8 +249,8 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		}
 
 		if(!light){ 
-			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
 			cudalog << "Allocating light with size: " << _lightSize;
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
 		}
 		// copy lightData to device
 		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
@@ -261,6 +264,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		if(dagSize128 != m_dag_size || !dag)
 		{
 			// create mining buffers
+			cudalog << "Generating mining buffers";
 			for (unsigned i = 0; i != s_numStreams; ++i)
 			{
 				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
@@ -276,7 +280,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 			if (!*hostDAG)
 			{
-				if(genDag || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
+				if(device_num == 0 || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
 					cudalog << "Generating DAG for GPU #" << device_num << " with dagSize: " 
 							<< dagSize <<" gridSize: " << s_gridSize << " &m_streams[0]: " << &m_streams[0];
 					ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);

From b37dca0780f61f35f6a2195a2a421b1ba4ab0a2b Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Fri, 8 Dec 2017 18:46:31 +0100
Subject: [PATCH 31/41] Fixed build error

---
 libethash-cuda/CUDAMiner.cpp         | 2 +-
 libethash-cuda/ethash_cuda_miner.cpp | 2 +-
 libethash-cuda/ethash_cuda_miner.h   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index e8a38fa24d..c4c1f4a1e8 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -138,7 +138,7 @@ bool CUDAMiner::init(const h256& seed)
 		bytesConstRef lightData = light->data();
 
 		m_miner->init(light->light, lightData.data(), lightData.size(), 
-			device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory, device == 0);
+			device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
 		s_dagLoadIndex++;
 
 		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 077fe27556..de0b822339 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -242,7 +242,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		if(dagSize128 != m_dag_size || !dag)
 		{
 			//We need to reset the device and recreate the dag  
-			cudalog << "Resetting device: ";
+			cudalog << "Resetting device";
 			CUDA_SAFE_CALL(cudaDeviceReset());
 			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 6db52653ac..8ed2794443 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -37,7 +37,7 @@ class ethash_cuda_miner
 		);
         static void setParallelHash(unsigned _parallelHash);
 
-	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG,bool reloadDag);
+	bool init(ethash_light_t _light, uint8_t const* _lightData, uint64_t _lightSize, unsigned _deviceId, bool _cpyToHost, volatile void** hostDAG);
 
 	void finish();
 	void search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN);

From a28bcdaf97b5890afd61d0596f39fac5658eb6cb Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 21 Dec 2017 17:56:05 +0100
Subject: [PATCH 32/41] fixed simulation

---
 libethash-cuda/CUDAMiner.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index c4c1f4a1e8..85550b3c57 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -114,7 +114,7 @@ void CUDAMiner::report(uint64_t _nonce)
 
 void CUDAMiner::kickOff()
 {
-	//m_hook->reset();
+	m_hook->reset();
 	//startWorking();
 }
 
@@ -173,7 +173,7 @@ void CUDAMiner::workLoop()
 		{
 			const WorkPackage w = work();
 			
-			if(!m_miner || current.seed != w.seed)
+			if(!m_miner || current.header != w.header)
 			{
 				if(!w || w.header == h256())
 				{
@@ -192,7 +192,7 @@ void CUDAMiner::workLoop()
 			}
 			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)current.boundary >> 192);
 			uint64_t startN = current.startNonce;
-			if (current.exSizeBits >= 0)
+			if (current.exSizeBits >= 0) 
 				startN = current.startNonce | ((uint64_t)index << (64 - 4 - current.exSizeBits)); // this can support up to 16 devices
 			m_miner->search(current.header.data(), upper64OfBoundary, *m_hook, (current.exSizeBits >= 0), startN);
 		}
@@ -207,7 +207,7 @@ void CUDAMiner::workLoop()
 
 void CUDAMiner::pause()
 {
-	//m_hook->abort();
+	m_hook->abort();
 	//stopWorking();
 }
 

From 09eb4e38a220a20ec14414cc2ebeb1e2713f5c25 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 21 Dec 2017 18:35:53 +0100
Subject: [PATCH 33/41] rewrote guard

---
 libethash-cuda/CUDAMiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 85550b3c57..1c425ff61e 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -173,7 +173,7 @@ void CUDAMiner::workLoop()
 		{
 			const WorkPackage w = work();
 			
-			if(!m_miner || current.header != w.header)
+			if(!m_miner || current.header != w.header || current.seed != w.seed)
 			{
 				if(!w || w.header == h256())
 				{

From 4ae0e2078dbe2c4c941372e52f7800e8e6d3cc47 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Wed, 27 Dec 2017 11:35:49 +0100
Subject: [PATCH 34/41] fixed dag generation error

---
 libethash-cuda/ethash_cuda_miner.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index de0b822339..c8a929c78b 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -233,20 +233,23 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
-		// create buffer for cache
-		hash64_t * light = m_light[device_num];
-		hash128_t * dag = m_dag;
+		
 		
 		CUDA_SAFE_CALL(cudaSetDevice(device_num));
 		cudalog << "Set Device to current";
-		if(dagSize128 != m_dag_size || !dag)
+		if(dagSize128 != m_dag_size || !m_dag)
 		{
 			//We need to reset the device and recreate the dag  
 			cudalog << "Resetting device";
 			CUDA_SAFE_CALL(cudaDeviceReset());
 			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+			m_light[device_num] = nullptr;
+			m_dag = nullptr;
 		}
+		// create buffer for cache
+		hash128_t * dag = m_dag;
+		hash64_t * light = m_light[device_num];
 
 		if(!light){ 
 			cudalog << "Allocating light with size: " << _lightSize;
@@ -264,7 +267,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		if(dagSize128 != m_dag_size || !dag)
 		{
 			// create mining buffers
-			cudalog << "Generating mining buffers";
+			cudalog << "Generating mining buffers"; //TODO whats up with this?
 			for (unsigned i = 0; i != s_numStreams; ++i)
 			{
 				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
@@ -322,6 +325,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN)
 {
+	cudalog << "Starting search on "<< target << " startNonce: "<<_startN;
 	bool initialize = false;
 	bool exit = false;
 	if (memcmp(&m_current_header, header, sizeof(hash32_t)))

From 6a251e67b4bb9d27980a6118cf7c5935bc8dd487 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 01:54:33 +0100
Subject: [PATCH 35/41] removed unnecessary print

---
 libethash-cuda/ethash_cuda_miner.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index c8a929c78b..85ab826bb2 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -325,7 +325,6 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 void ethash_cuda_miner::search(uint8_t const* header, uint64_t target, search_hook& hook, bool _ethStratum, uint64_t _startN)
 {
-	cudalog << "Starting search on "<< target << " startNonce: "<<_startN;
 	bool initialize = false;
 	bool exit = false;
 	if (memcmp(&m_current_header, header, sizeof(hash32_t)))

From 7c0aecf311e1b9465dfcf3910215232d45732ba6 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 02:21:19 +0100
Subject: [PATCH 36/41] fixed merge error

---
 libethash-cuda/CUDAMiner.cpp         | 2 +-
 libethash-cuda/ethash_cuda_miner.cpp | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 22741816fd..1fae0c35b5 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -138,7 +138,7 @@ bool CUDAMiner::init(const h256& seed)
 		bytesConstRef lightData = light->data();
 
 		m_miner->init(light->light, lightData.data(), lightData.size(), 
-			device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), &s_dagInHostMemory);
+			device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), s_dagInHostMemory);
 		s_dagLoadIndex++;
     
 		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 5fdebd5503..b7981e4605 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -280,7 +280,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 			m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
 
-			if (!*hostDAG)
+			if (!hostDAG)
 			{
 				if(device_num == 0 || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
 					cudalog << "Generating DAG for GPU #" << device_num << " with dagSize: " 
@@ -293,10 +293,10 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 						cudalog << "Copying DAG from GPU #" << device_num << " to host";
 						CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
 
-						*hostDAG = (void*)memoryDAG;
+						hostDAG = memoryDAG;
 					}
 				}else{
-					while(!*hostDAG)
+					while(!hostDAG)
 						this_thread::sleep_for(chrono::milliseconds(100)); 
 					goto cpyDag;
 				}
@@ -305,7 +305,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			{
 cpyDag:
 				cudalog << "Copying DAG from host to GPU #" << device_num;
-				const void* hdag = (const void*)(*hostDAG);
+				const void* hdag = (const void*)hostDAG;
 				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
 			}
 		}

From c77a0ca8e54f2be27a37925135557f09f08d9be3 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 09:47:39 +0100
Subject: [PATCH 37/41] removed unnecessary cnote

---
 libethash-cuda/CUDAMiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 1fae0c35b5..3f1ccf5197 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -182,7 +182,7 @@ void CUDAMiner::workLoop()
 					continue;
 				}
 				
-				cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
+				//cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 				if (!m_miner || current.seed != w.seed)
 				{
 					if(!init(w.seed))

From 8aba73b9fb467d3314c0648fb73de263eacfeea2 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 12:10:22 +0100
Subject: [PATCH 38/41] changed requested changes

---
 libethash-cuda/CUDAMiner.cpp         | 13 ++++++-------
 libethash-cuda/ethash_cuda_miner.cpp | 12 ++++++++----
 libethash-cuda/ethash_cuda_miner.h   |  7 ++++---
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 3f1ccf5197..e716cea624 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -17,8 +17,6 @@ along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
 /** @file CUDAMiner.cpp
 * @author Gav Wood <i@gavwood.com>
 * @date 2014
-* @author MariusVanDerWijden
-* @date 2017
 *
 * Determines the PoW algorithm.
 */
@@ -115,7 +113,6 @@ void CUDAMiner::report(uint64_t _nonce)
 void CUDAMiner::kickOff()
 {
 	m_hook->reset();
-	//startWorking();
 }
 
 bool CUDAMiner::init(const h256& seed)
@@ -127,14 +124,11 @@ bool CUDAMiner::init(const h256& seed)
 		cnote << "Initialising miner...";
 		m_minerSeed = seed;
 
-		//delete m_miner;
-		//m_miner = new ethash_cuda_miner;
 		if(!m_miner)
 			m_miner = new ethash_cuda_miner;
 
 		EthashAux::LightType light;
 		light = EthashAux::light(seed);
-		//bytesConstRef dagData = dag->data();
 		bytesConstRef lightData = light->data();
 
 		m_miner->init(light->light, lightData.data(), lightData.size(), 
@@ -195,6 +189,12 @@ void CUDAMiner::workLoop()
 			if (current.exSizeBits >= 0) 
 				startN = current.startNonce | ((uint64_t)index << (64 - 4 - current.exSizeBits)); // this can support up to 16 devices
 			m_miner->search(current.header.data(), upper64OfBoundary, *m_hook, (current.exSizeBits >= 0), startN);
+			
+			// Check if we should stop.
+			if (shouldStop())
+			{
+				break;
+			}
 		}
 	}
 	catch (std::runtime_error const& _e)
@@ -208,7 +208,6 @@ void CUDAMiner::workLoop()
 void CUDAMiner::pause()
 {
 	m_hook->abort();
-	//stopWorking();
 }
 
 std::string CUDAMiner::platformInfo()
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index b7981e4605..4852b07237 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -17,8 +17,6 @@
 /** @file ethash_cuda_miner.cpp
 * @author Genoil <jw@meneer.net>
 * @date 2015
-* @author MariusVanDerWijden
-* @date 2017
 */
 
 
@@ -69,7 +67,11 @@ ethash_cuda_miner::ethash_cuda_miner()
 	m_light = new hash64_t*[devicesCount];
 	for(int i = 0; i < devicesCount; i++)
 		m_light[i] = nullptr;
-	m_dag = nullptr;
+}
+
+ethash_cuda_miner::~ethash_cuda_miner()
+{
+	delete[] m_light;
 }
 
 std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
@@ -243,8 +245,10 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			CUDA_SAFE_CALL(cudaDeviceReset());
 			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+			//We need to reset the light and the Dag for the following code to reallocate
+			//since cudaDeviceReset() free's all previous allocated memory
 			m_light[device_num] = nullptr;
-			m_dag = nullptr;
+			m_dag = nullptr; 
 		}
 		// create buffer for cache
 		hash128_t * dag = m_dag;
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 468ac28a28..43ac29398d 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -23,6 +23,7 @@ class ethash_cuda_miner
 
 public:
 	ethash_cuda_miner();
+	~ethash_cuda_miner();
 
 	static std::string platform_info(unsigned _deviceId = 0);
 	static int getNumDevices();
@@ -60,9 +61,9 @@ class ethash_cuda_miner
 	uint32_t m_sharedBytes;
 	
 	///Constants on GPU
-	hash128_t* m_dag;
-	hash64_t ** m_light;
-	uint32_t m_dag_size;
+	hash128_t* m_dag = nullptr;
+	hash64_t ** m_light = nullptr;
+	uint32_t m_dag_size = -1;
 	int m_device_num;
 	
 	

From a8facaf0314cdea9ac79408b1ae8f1cda0e5b98f Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 18:33:42 +0100
Subject: [PATCH 39/41] changed hwmon breaking bug

---
 libethash-cuda/ethash_cuda_miner.cpp | 22 +++++++++++-----------
 libethash-cuda/ethash_cuda_miner.h   |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 4852b07237..ad0bb06a44 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -219,11 +219,11 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			return false;
 
 		// use selected device
-		unsigned device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
+		m_device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
 		nvmlh = wrap_nvml_create();
 
 		cudaDeviceProp device_props;
-		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, device_num));
+		CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, m_device_num));
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
@@ -236,7 +236,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 		
 		
-		CUDA_SAFE_CALL(cudaSetDevice(device_num));
+		CUDA_SAFE_CALL(cudaSetDevice(m_device_num));
 		cudalog << "Set Device to current";
 		if(dagSize128 != m_dag_size || !m_dag)
 		{
@@ -247,12 +247,12 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 			//We need to reset the light and the Dag for the following code to reallocate
 			//since cudaDeviceReset() free's all previous allocated memory
-			m_light[device_num] = nullptr;
+			m_light[m_device_num] = nullptr;
 			m_dag = nullptr; 
 		}
 		// create buffer for cache
 		hash128_t * dag = m_dag;
-		hash64_t * light = m_light[device_num];
+		hash64_t * light = m_light[m_device_num];
 
 		if(!light){ 
 			cudalog << "Allocating light with size: " << _lightSize;
@@ -260,7 +260,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		}
 		// copy lightData to device
 		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
-		m_light[device_num] = light;
+		m_light[m_device_num] = light;
 		
 		if(dagSize128 != m_dag_size || !dag) // create buffer for dag
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
@@ -286,15 +286,15 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 			if (!hostDAG)
 			{
-				if(device_num == 0 || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
-					cudalog << "Generating DAG for GPU #" << device_num << " with dagSize: " 
+				if(m_device_num == 0 || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
+					cudalog << "Generating DAG for GPU #" << m_device_num << " with dagSize: " 
 							<< dagSize <<" gridSize: " << s_gridSize << " &m_streams[0]: " << &m_streams[0];
-					ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], device_num);
+					ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], m_device_num);
 
 					if (_cpyToHost)
 					{
 						uint8_t* memoryDAG = new uint8_t[dagSize];
-						cudalog << "Copying DAG from GPU #" << device_num << " to host";
+						cudalog << "Copying DAG from GPU #" << m_device_num << " to host";
 						CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
 
 						hostDAG = memoryDAG;
@@ -308,7 +308,7 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 			else
 			{
 cpyDag:
-				cudalog << "Copying DAG from host to GPU #" << device_num;
+				cudalog << "Copying DAG from host to GPU #" << m_device_num;
 				const void* hdag = (const void*)hostDAG;
 				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
 			}
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 43ac29398d..95f34d8496 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -64,7 +64,7 @@ class ethash_cuda_miner
 	hash128_t* m_dag = nullptr;
 	hash64_t ** m_light = nullptr;
 	uint32_t m_dag_size = -1;
-	int m_device_num;
+	uint32_t m_device_num;
 	
 	
 

From 5a42eb74483304e908999ed0674769685b77b7ec Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 18:48:42 +0100
Subject: [PATCH 40/41] use std::vector instead of raw array

---
 libethash-cuda/ethash_cuda_miner.cpp | 11 +++--------
 libethash-cuda/ethash_cuda_miner.h   |  3 +--
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index ad0bb06a44..283898bad1 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -63,17 +63,12 @@ ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
 {
-	int devicesCount = getNumDevices();
-	m_light = new hash64_t*[devicesCount];
-	for(int i = 0; i < devicesCount; i++)
+	int deviceCount = getNumDevices();
+	m_light = std::vector<hash64_t*>(deviceCount);
+	for(int i = 0; i < deviceCount; i++)
 		m_light[i] = nullptr;
 }
 
-ethash_cuda_miner::~ethash_cuda_miner()
-{
-	delete[] m_light;
-}
-
 std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
 {
 	int runtime_version;
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 95f34d8496..e924e8de60 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -23,7 +23,6 @@ class ethash_cuda_miner
 
 public:
 	ethash_cuda_miner();
-	~ethash_cuda_miner();
 
 	static std::string platform_info(unsigned _deviceId = 0);
 	static int getNumDevices();
@@ -62,7 +61,7 @@ class ethash_cuda_miner
 	
 	///Constants on GPU
 	hash128_t* m_dag = nullptr;
-	hash64_t ** m_light = nullptr;
+	std::vector<hash64_t*> m_light;
 	uint32_t m_dag_size = -1;
 	uint32_t m_device_num;
 	

From 6b001373fb572d1cc6490c2d4668c457fc055735 Mon Sep 17 00:00:00 2001
From: Marius van der Wijden <m.vanderwijden@live.de>
Date: Thu, 28 Dec 2017 19:04:37 +0100
Subject: [PATCH 41/41] fixed init of m_light

---
 libethash-cuda/ethash_cuda_miner.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index 283898bad1..162287c590 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -63,10 +63,7 @@ ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
 {
-	int deviceCount = getNumDevices();
-	m_light = std::vector<hash64_t*>(deviceCount);
-	for(int i = 0; i < deviceCount; i++)
-		m_light[i] = nullptr;
+	m_light.resize(getNumDevices());
 }
 
 std::string ethash_cuda_miner::platform_info(unsigned _deviceId)