diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
index 5a315d9c54..e716cea624 100644
--- a/libethash-cuda/CUDAMiner.cpp
+++ b/libethash-cuda/CUDAMiner.cpp
@@ -113,75 +113,89 @@ void CUDAMiner::report(uint64_t _nonce)
 void CUDAMiner::kickOff()
 {
 	m_hook->reset();
-	startWorking();
 }
 
-void CUDAMiner::workLoop()
+bool CUDAMiner::init(const h256& seed)
 {
 	// take local copy of work since it may end up being overwritten by kickOff/pause.
 	try {
-		WorkPackage w = work();
-		if (!w)
-			return;
+		unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
 
-		if (!m_miner || m_minerSeed != w.seed)
-		{
-			unsigned device = s_devices[index] > -1 ? s_devices[index] : index;
+		cnote << "Initialising miner...";
+		m_minerSeed = seed;
+
+		if(!m_miner)
+			m_miner = new ethash_cuda_miner;
 
-			if (s_dagLoadMode == DAG_LOAD_MODE_SEQUENTIAL)
+		EthashAux::LightType light;
+		light = EthashAux::light(seed);
+		bytesConstRef lightData = light->data();
+
+		m_miner->init(light->light, lightData.data(), lightData.size(), 
+			device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), s_dagInHostMemory);
+		s_dagLoadIndex++;
+    
+		if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		{
+			if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
 			{
-				while (s_dagLoadIndex < index) {
-					this_thread::sleep_for(chrono::seconds(1));
-				}
+				// all devices have loaded DAG, we can free now
+				delete[] s_dagInHostMemory;
+				s_dagInHostMemory = NULL;
+				cout << "Freeing DAG from host" << endl;
 			}
-			else if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+		}
+		return true;
+	}
+	catch (std::runtime_error const& _e)
+	{
+		delete m_miner;
+		m_miner = nullptr;
+		cwarn << "Error CUDA mining: " << _e.what();
+		return false;
+	}
+}
+
+void CUDAMiner::workLoop()
+{
+	WorkPackage current;
+	current.header = h256{1u};
+	current.seed = h256{1u};
+	try
+	{
+		while(true)
+		{
+			const WorkPackage w = work();
+			
+			if(!m_miner || current.header != w.header || current.seed != w.seed)
 			{
-				if (device != s_dagCreateDevice)
+				if(!w || w.header == h256())
 				{
-					// wait until DAG is created on selected device
-					while (s_dagInHostMemory == NULL) {
-						this_thread::sleep_for(chrono::seconds(1));
-					}
+					cnote << "No work. Pause for 3 s.";
+					std::this_thread::sleep_for(std::chrono::seconds(3));
+					continue;
 				}
-				else
+				
+				//cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
+				if (!m_miner || current.seed != w.seed)
 				{
-					// reset load index
-					s_dagLoadIndex = 0;
+					if(!init(w.seed))
+						break;
 				}
+				current = w;
 			}
-
-			cnote << "Initialising miner...";
-			m_minerSeed = w.seed;
-
-			delete m_miner;
-			m_miner = new ethash_cuda_miner;
-
-			EthashAux::LightType light;
-			light = EthashAux::light(w.seed);
-			//bytesConstRef dagData = dag->data();
-			bytesConstRef lightData = light->data();
-
-			m_miner->init(light->light, lightData.data(), lightData.size(), device, (s_dagLoadMode == DAG_LOAD_MODE_SINGLE), s_dagInHostMemory);
-			s_dagLoadIndex++;
-
-			if (s_dagLoadMode == DAG_LOAD_MODE_SINGLE)
+			uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)current.boundary >> 192);
+			uint64_t startN = current.startNonce;
+			if (current.exSizeBits >= 0) 
+				startN = current.startNonce | ((uint64_t)index << (64 - 4 - current.exSizeBits)); // this can support up to 16 devices
+			m_miner->search(current.header.data(), upper64OfBoundary, *m_hook, (current.exSizeBits >= 0), startN);
+			
+			// Check if we should stop.
+			if (shouldStop())
 			{
-				if (s_dagLoadIndex >= s_numInstances && s_dagInHostMemory)
-				{
-					// all devices have loaded DAG, we can free now
-					delete[] s_dagInHostMemory;
-					s_dagInHostMemory = NULL;
-
-					cout << "Freeing DAG from host" << endl;
-				}
+				break;
 			}
 		}
-
-		uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)w.boundary >> 192);
-		uint64_t startN = w.startNonce;
-		if (w.exSizeBits >= 0)
-			startN = w.startNonce | ((uint64_t)index << (64 - 4 - w.exSizeBits)); // this can support up to 16 devices
-		m_miner->search(w.header.data(), upper64OfBoundary, *m_hook, (w.exSizeBits >= 0), startN);
 	}
 	catch (std::runtime_error const& _e)
 	{
@@ -194,7 +208,6 @@ void CUDAMiner::workLoop()
 void CUDAMiner::pause()
 {
 	m_hook->abort();
-	stopWorking();
 }
 
 std::string CUDAMiner::platformInfo()
diff --git a/libethash-cuda/CUDAMiner.h b/libethash-cuda/CUDAMiner.h
index 690a3ad47f..3e0fce5fd2 100644
--- a/libethash-cuda/CUDAMiner.h
+++ b/libethash-cuda/CUDAMiner.h
@@ -76,6 +76,9 @@ class EthashCUDAHook;
 	private:
 		void workLoop() override;
 		void report(uint64_t _nonce);
+		void initDevice(WorkPackage w);
+
+		bool init(const h256& seed);
 
 		EthashCUDAHook* m_hook = nullptr;
 		ethash_cuda_miner* m_miner = nullptr;
diff --git a/libethash-cuda/ethash_cuda_miner.cpp b/libethash-cuda/ethash_cuda_miner.cpp
index d431a4cf72..162287c590 100644
--- a/libethash-cuda/ethash_cuda_miner.cpp
+++ b/libethash-cuda/ethash_cuda_miner.cpp
@@ -63,6 +63,7 @@ ethash_cuda_miner::search_hook::~search_hook() {}
 
 ethash_cuda_miner::ethash_cuda_miner()
 {
+	m_light.resize(getNumDevices());
 }
 
 std::string ethash_cuda_miner::platform_info(unsigned _deviceId)
@@ -204,14 +205,13 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 {
 	try
 	{
-		int device_count = getNumDevices();
+		unsigned device_count = getNumDevices();
 
 		if (device_count == 0)
 			return false;
 
 		// use selected device
-		m_device_num = std::min<int>((int)_deviceId, device_count - 1);
-
+		m_device_num = _deviceId < device_count -1 ? _deviceId : device_count - 1;
 		nvmlh = wrap_nvml_create();
 
 		cudaDeviceProp device_props;
@@ -219,11 +219,6 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
-		CUDA_SAFE_CALL(cudaSetDevice(m_device_num));
-		CUDA_SAFE_CALL(cudaDeviceReset());
-		CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
-		CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
-
 		m_search_buf = new volatile uint32_t *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];
 
@@ -231,55 +226,88 @@ bool ethash_cuda_miner::init(ethash_light_t _light, uint8_t const* _lightData, u
 		uint32_t dagSize128   = (unsigned)(dagSize / ETHASH_MIX_BYTES);
 		uint32_t lightSize64 = (unsigned)(_lightSize / sizeof(node));
 
+		
+		
+		CUDA_SAFE_CALL(cudaSetDevice(m_device_num));
+		cudalog << "Set Device to current";
+		if(dagSize128 != m_dag_size || !m_dag)
+		{
+			//We need to reset the device and recreate the dag  
+			cudalog << "Resetting device";
+			CUDA_SAFE_CALL(cudaDeviceReset());
+			CUDA_SAFE_CALL(cudaSetDeviceFlags(s_scheduleFlag));
+			CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+			//We need to reset the light and the Dag for the following code to reallocate
+			//since cudaDeviceReset() free's all previous allocated memory
+			m_light[m_device_num] = nullptr;
+			m_dag = nullptr; 
+		}
 		// create buffer for cache
-		hash64_t * light = NULL;
+		hash128_t * dag = m_dag;
+		hash64_t * light = m_light[m_device_num];
 
-		if (!hostDAG)
-		{
+		if(!light){ 
+			cudalog << "Allocating light with size: " << _lightSize;
 			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), _lightSize));
-			// copy dag cache to CPU.
-			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
 		}
-
-		// create buffer for dag
-		hash128_t * dag;
-		CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
+		// copy lightData to device
+		CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), _lightData, _lightSize, cudaMemcpyHostToDevice));
+		m_light[m_device_num] = light;
 		
-		// create mining buffers
-		for (unsigned i = 0; i != s_numStreams; ++i)
+		if(dagSize128 != m_dag_size || !dag) // create buffer for dag
+			CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), dagSize));
+			
+		set_constants(dag, dagSize128, light, lightSize64); //in ethash_cuda_miner_kernel.cu
+		
+		if(dagSize128 != m_dag_size || !dag)
 		{
-			CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
-			CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
-		}
-		set_constants(dag, dagSize128, light, lightSize64);
-		memset(&m_current_header, 0, sizeof(hash32_t));
-		m_current_target = 0;
-		m_current_nonce = 0;
-		m_current_index = 0;
-
-		m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
+			// create mining buffers
+			cudalog << "Generating mining buffers"; //TODO whats up with this?
+			for (unsigned i = 0; i != s_numStreams; ++i)
+			{
+				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+				CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
+			}
+			
+			memset(&m_current_header, 0, sizeof(hash32_t));
+			m_current_target = 0;
+			m_current_nonce = 0;
+			m_current_index = 0;
 
-		if (!hostDAG)
-		{
-			cudalog << "Generating DAG for GPU #" << m_device_num;
-			ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], m_device_num);
+			m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
 
-			if (_cpyToHost)
+			if (!hostDAG)
 			{
-				uint8_t* memoryDAG = new uint8_t[dagSize];
-				cudalog << "Copying DAG from GPU #" << m_device_num << " to host";
-				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
-
-				hostDAG = memoryDAG;
+				if(m_device_num == 0 || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
+					cudalog << "Generating DAG for GPU #" << m_device_num << " with dagSize: " 
+							<< dagSize <<" gridSize: " << s_gridSize << " &m_streams[0]: " << &m_streams[0];
+					ethash_generate_dag(dagSize, s_gridSize, s_blockSize, m_streams[0], m_device_num);
+
+					if (_cpyToHost)
+					{
+						uint8_t* memoryDAG = new uint8_t[dagSize];
+						cudalog << "Copying DAG from GPU #" << m_device_num << " to host";
+						CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(memoryDAG), dag, dagSize, cudaMemcpyDeviceToHost));
+
+						hostDAG = memoryDAG;
+					}
+				}else{
+					while(!hostDAG)
+						this_thread::sleep_for(chrono::milliseconds(100)); 
+					goto cpyDag;
+				}
+			}
+			else
+			{
+cpyDag:
+				cudalog << "Copying DAG from host to GPU #" << m_device_num;
+				const void* hdag = (const void*)hostDAG;
+				CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
 			}
 		}
-		else
-		{
-			cudalog << "Copying DAG from host to GPU #" << m_device_num;
-			const void* hdag = (const void*)hostDAG;
-			CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dag), hdag, dagSize, cudaMemcpyHostToDevice));
-		}
-
+    
+		m_dag = dag;
+		m_dag_size = dagSize128;
 		return true;
 	}
 	catch (runtime_error const&)
diff --git a/libethash-cuda/ethash_cuda_miner.h b/libethash-cuda/ethash_cuda_miner.h
index 6defad3388..e924e8de60 100644
--- a/libethash-cuda/ethash_cuda_miner.h
+++ b/libethash-cuda/ethash_cuda_miner.h
@@ -58,7 +58,14 @@ class ethash_cuda_miner
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
 	uint32_t m_sharedBytes;
-	int m_device_num;
+	
+	///Constants on GPU
+	hash128_t* m_dag = nullptr;
+	std::vector<hash64_t*> m_light;
+	uint32_t m_dag_size = -1;
+	uint32_t m_device_num;
+	
+	
 
 	volatile uint32_t ** m_search_buf;
 	cudaStream_t  * m_streams;
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
index d25cc14abe..5ae6f89649 100644
--- a/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -104,8 +104,9 @@ ethash_calculate_dag_item(uint32_t start)
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}
-
 		}
+
+
 #endif		
 	}
 	SHA3_512(dag_node.uint2s);