Bugfix: wrong order on gpu

This commit is contained in:
Laurent Clouet
2018-05-03 13:35:57 +02:00
parent 530d4523af
commit 47da749efc
3 changed files with 75 additions and 1 deletions

View File

@@ -33,6 +33,11 @@ public interface CUDA extends Library {
public int cuDeviceGetName(byte[] name, int len, int dev); public int cuDeviceGetName(byte[] name, int len, int dev);
public int cuDeviceGet (IntByReference device, int ordinal);
public int cuDeviceGetAttribute (IntByReference pi, int attrib, int dev );
public int cuDeviceTotalMem_v2(LongByReference bytes, int dev); public int cuDeviceTotalMem_v2(LongByReference bytes, int dev);
public int cuDeviceTotalMem(LongByReference bytes, int dev); public int cuDeviceTotalMem(LongByReference bytes, int dev);
} }

View File

@@ -0,0 +1,41 @@
/*
* Copyright (C) 2018 Laurent CLOUET
* Author Laurent CLOUET <laurent.clouet@nopnop.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package com.sheepit.client.hardware.gpu;
/**
 * CUDA device attribute identifiers, passed as the {@code attrib} argument of
 * {@code cuDeviceGetAttribute}. Values are taken directly from the online manual:
 * https://docs.nvidia.com/cuda/cuda-driver-api
 *
 * This class only holds constants; it is final and cannot be instantiated.
 */
public final class CUDeviceAttribute {
	/**
	 * PCI bus ID of the device
	 */
	public static final int CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33;

	/**
	 * PCI device ID of the device
	 */
	public static final int CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34;

	/**
	 * PCI domain ID of the device
	 */
	public static final int CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50;

	/**
	 * Not instantiable: this type exists only as a namespace for the constants above.
	 */
	private CUDeviceAttribute() {
	}
}

View File

@@ -19,6 +19,7 @@
package com.sheepit.client.hardware.gpu; package com.sheepit.client.hardware.gpu;
import java.util.HashMap;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
@@ -81,7 +82,23 @@ public class GPU {
return false; return false;
} }
HashMap<Integer, GPUDevice> devicesWithPciId = new HashMap<Integer, GPUDevice>(count.getValue());
for (int num = 0; num < count.getValue(); num++) { for (int num = 0; num < count.getValue(); num++) {
IntByReference aDevice = new IntByReference();
result = cudalib.cuDeviceGet(aDevice, num);
if (result != CUresult.CUDA_SUCCESS) {
System.out.println("GPU::generate cuDeviceGet failed (ret: " + CUresult.stringFor(result) + ")");
continue;
}
IntByReference pciBusId = new IntByReference();
result = cudalib.cuDeviceGetAttribute(pciBusId, CUDeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, aDevice.getValue());
if (result != CUresult.CUDA_SUCCESS) {
System.out.println("GPU::generate cuDeviceGetAttribute for CU_DEVICE_ATTRIBUTE_PCI_BUS_ID failed (ret: " + CUresult.stringFor(result) + ")");
continue;
}
byte name[] = new byte[256]; byte name[] = new byte[256];
result = cudalib.cuDeviceGetName(name, 256, num); result = cudalib.cuDeviceGetName(name, 256, num);
@@ -104,8 +121,19 @@ public class GPU {
return false; return false;
} }
devices.add(new GPUDevice(new String(name).trim(), ram.getValue(), "CUDA_" + Integer.toString(num))); devicesWithPciId.put(pciBusId.getValue(), new GPUDevice(new String(name).trim(), ram.getValue(), "FAKE"));
} }
// generate proper cuda id
// in theory, setting the environment variable "CUDA_DEVICE_ORDER=PCI_BUS_ID" should be enough, but it did not work
int i = 0;
for (HashMap.Entry<Integer, GPUDevice> entry : devicesWithPciId.entrySet()){
GPUDevice aDevice = entry.getValue();
aDevice.setCudaName("CUDA_" + Integer.toString(i));
devices.add(aDevice);
i++;
}
return true; return true;
} }