Commit 4e719f5

Merge pull request #8 from mikepapadim/feat/fp16_input_weights
Switch to FP16 -> HalfFloatArray for model weights to avoid on-the-fly decoding
2 parents: bb1676b + c6d7a8c
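
In short: the GGUF weights for this model family are stored as FP16, and the previous FloatArray path either widened them to FP32 at load or decoded them on the fly during inference. Holding weights in TornadoVM's HalfFloatArray keeps the 16-bit representation end to end, halving the device-side footprint and reducing each read to a single widening conversion. A minimal sketch of the idea, not code from this commit (Float.floatToFloat16 / Float.float16ToFloat are JDK 20+ methods):

```java
// Sketch: pack weights to FP16 once at load time so inference reads
// are a single widening conversion instead of per-block dequantization.
public final class Fp16PackingSketch {
    public static void main(String[] args) {
        float[] weights = {0.1f, -1.5f, 3.25f};

        // One-time encode at model-load time (JDK 20+).
        short[] fp16 = new short[weights.length];
        for (int i = 0; i < weights.length; i++) {
            fp16[i] = Float.floatToFloat16(weights[i]);
        }

        // Inference-time read: one conversion, no block decoding.
        System.out.println(Float.float16ToFloat(fp16[2])); // 3.25
    }
}
```

The trade-off is a one-time O(n) conversion pass at load, which the diff below pays in loadTensorAsHalfFloatArray.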

File tree

8 files changed: +202 −508 lines

.gitmodules

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,5 @@
 [submodule "external/tornadovm"]
 	path = external/tornadovm
 	url = https://github.com/beehive-lab/TornadoVM.git
+	branch = master
+
```

README.md

Lines changed: 0 additions & 6 deletions
```diff
@@ -529,12 +529,6 @@ The secret sauce that transforms regular Java code into GPU-accelerated compute
 -----------
 
 
-## Early performance of v1.0
-
-![GPULlama3.java Performance Comparison](./docs/performance.png)
-
------------
-
 ## License
 
 
```


src/main/java/com/example/loader/weights/ModelLoader.java

Lines changed: 63 additions & 60 deletions
```diff
@@ -9,18 +9,18 @@
 import com.example.core.model.tensor.GGMLTensorEntry;
 import com.example.core.model.tensor.Q4_0FloatTensor;
 import com.example.core.model.tensor.Q8_0FloatTensor;
-import com.example.core.types.Float16;
 import com.example.core.types.Pair;
 import com.example.inference.engine.impl.Configuration;
 import com.example.inference.engine.impl.Llama;
 import com.example.inference.operation.RoPE;
 import com.example.tokenizer.impl.Tokenizer;
 import com.example.tokenizer.vocabulary.Vocabulary;
+import uk.ac.manchester.tornado.api.types.HalfFloat;
 import uk.ac.manchester.tornado.api.types.arrays.ByteArray;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 
 import java.io.IOException;
-import java.lang.foreign.MemorySegment;
 import java.nio.ByteOrder;
 import java.nio.FloatBuffer;
 import java.nio.channels.FileChannel;
@@ -33,9 +33,6 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-import static com.example.core.model.tensor.FloatTensor.readByte;
-import static com.example.core.model.tensor.FloatTensor.readShort;
-
 public final class ModelLoader {
     private static final String TOKENIZER_LLAMA_3_MODEL = "gpt2";
 
```

```diff
@@ -104,15 +101,15 @@ private static Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tenso
         return new Weights(
                 // Load directly to TornadoVM format
                 loadTensorAsFloatArray(tokenEmbeddings), loadArrayAsFloatArrayFromBuffer(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_v.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_output.weight")),
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_v.weight")),
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_output.weight")),
                 loadArrayAsFloatArrayFromBuffer(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_norm.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_gate.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_down.weight")),
-                loadArrayAsFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_up.weight")), floatBufferToFloatArray(tensorEntries.get("output_norm.weight")),
-                FloatArray.fromArray(ropeFreqs.first()), FloatArray.fromArray(ropeFreqs.second()), createByteArrayFromTensor(outputWeight), outputWeight.ggmlType());
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_gate.weight")),
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_down.weight")),
+                loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_up.weight")), floatBufferToFloatArray(tensorEntries.get("output_norm.weight")),
+                FloatArray.fromArray(ropeFreqs.first()), FloatArray.fromArray(ropeFreqs.second()), loadTensorAsHalfFloatArray(outputWeight), outputWeight.ggmlType());
     }
 
     /**
```

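With the attention and FFN projection weights now held as one HalfFloatArray per layer, a TornadoVM kernel can consume FP16 weights directly. A hypothetical matrix-vector kernel in that style (not part of this commit; it assumes TornadoVM's @Parallel annotation and the HalfFloat.getFloat32() accessor):

```java
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;

public final class Fp16MatVec {
    // out[i] = sum_j w[i][j] * x[j], with w stored row-major in FP16.
    // Each weight is widened to FP32 in-register; no separate decode pass.
    public static void matVec(HalfFloatArray w, FloatArray x, FloatArray out, int rows, int cols) {
        for (@Parallel int i = 0; i < rows; i++) {
            float acc = 0.0f;
            for (int j = 0; j < cols; j++) {
                acc += w.get(i * cols + j).getFloat32() * x.get(j);
            }
            out.set(i, acc);
        }
    }
}
```

Note that the norm weights (attn_norm, ffn_norm, output_norm) stay on the FP32 FloatArray path, consistent with keeping small, precision-sensitive tensors in full precision.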
```diff
@@ -132,15 +129,51 @@ private static Weights createStandardWeights(Map<String, GGMLTensorEntry> tensor
                 FloatBuffer.wrap(ropeFreqs.first()), FloatBuffer.wrap(ropeFreqs.second()), loadQuantized(outputWeight), outputWeight.ggmlType());
     }
 
-    private static FloatArray[] loadArrayAsFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
+    private static Tokenizer createTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
+        String[] mergeLines = (String[]) metadata.get("tokenizer.ggml.merges");
+        List<Pair<Integer, Integer>> merges = Arrays.stream(mergeLines).map(line -> line.split(" "))
+                .map(parts -> new Pair<>(vocabulary.getIndex(parts[0]).orElseThrow(), vocabulary.getIndex(parts[1]).orElseThrow())).toList();
+
+        int allTokens = vocabulary.size();
+        int baseTokens = 128000; // assume all tokens after the base ones are special.
+        int reservedSpecialTokens = allTokens - baseTokens;
+        List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
+
+        assert specialTokensList.stream().allMatch(token -> vocabulary.getIndex(token).isPresent());
+
+        Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed().collect(Collectors.toMap(i -> specialTokensList.get(i), i -> baseTokens + i));
+
+        return new Tokenizer(vocabulary, merges, LLAMA_3_PATTERN, specialTokens);
+    }
+
+    public static FloatTensor loadQuantized(GGMLTensorEntry entry) {
+        GGMLType ggmlType = entry.ggmlType();
+        return switch (ggmlType) {
+            // case F32 -> new F32FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q8_0 -> new Q8_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q4_0 -> new Q4_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case F16 -> new F16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
+        };
+    }
+
+    public static FloatArray[] loadArrayAsFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
         FloatArray[] array = new FloatArray[size];
         for (int i = 0; i < size; i++) {
             array[i] = loadTensorAsFloatArray(getTensorEntry.apply(i));
         }
         return array;
     }
 
-    private static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
+    public static HalfFloatArray[] loadArrayAsHalfFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
+        HalfFloatArray[] array = new HalfFloatArray[size];
+        for (int i = 0; i < size; i++) {
+            array[i] = loadTensorAsHalfFloatArray(getTensorEntry.apply(i));
+        }
+        return array;
+    }
+
+    public static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
         if (tensorEntry.ggmlType() == GGMLType.F32) {
             FloatBuffer buffer = tensorEntry.memorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
             return FloatArray.fromFloatBuffer(buffer);
```

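The new loadArrayAsHalfFloatArray mirrors loadArrayAsFloatArray, producing one HalfFloatArray per transformer layer. A call site looks the same as in createTornadoVMWeights above, e.g.:

```java
// Hypothetical call site: FP16 query weights for every layer.
// tensorEntries and config come from the GGUF parser, as in the loader.
HalfFloatArray[] wq = ModelLoader.loadArrayAsHalfFloatArray(
        config.numberOfLayers,
        i -> tensorEntries.get("blk." + i + ".attn_q.weight"));
```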
```diff
@@ -149,20 +182,20 @@ private static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
         }
     }
 
-    private static FloatArray[] loadArrayAsFloatArrayFromBuffer(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
+    public static FloatArray[] loadArrayAsFloatArrayFromBuffer(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
         FloatArray[] array = new FloatArray[size];
         for (int i = 0; i < size; i++) {
             array[i] = floatBufferToFloatArray(getTensorEntry.apply(i));
         }
         return array;
     }
 
-    private static ByteArray createByteArrayFromTensor(GGMLTensorEntry entry) {
+    public static ByteArray createByteArrayFromTensor(GGMLTensorEntry entry) {
         FloatTensor tensor = loadQuantized(entry);
         return ByteArray.fromSegment(tensor.asMemorySegment());
     }
 
-    private static FloatArray loadTensorAsFloatArray(GGMLTensorEntry entry) {
+    public static FloatArray loadTensorAsFloatArray(GGMLTensorEntry entry) {
         if (entry.ggmlType() == GGMLType.F32) {
             // For F32, we can directly create FloatArray from memory
             FloatBuffer buffer = entry.memorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
@@ -182,50 +215,20 @@ private static FloatArray loadTensorAsFloatArray(GGMLTensorEntry entry) {
         }
     }
 
-    public static float getFloat(int index, int size, MemorySegment memorySegment) {
-        assert 0 <= index && index < size;
-        int blockIndex = index / GGMLType.Q4_0.getBlockSize();
-        int blockOffset = blockIndex * GGMLType.Q4_0.getTypeSize();
-        float scale = Float.float16ToFloat(readShort(memorySegment, blockOffset));
-        byte quant;
-        int modIndex = index % GGMLType.Q4_0.getBlockSize();
-        if (modIndex < GGMLType.Q4_0.getBlockSize() / 2) {
-            quant = (byte) (readByte(memorySegment, blockOffset + Float16.BYTES + modIndex) & 0x0F);
+    public static HalfFloatArray loadTensorAsHalfFloatArray(GGMLTensorEntry entry) {
+        if (entry.ggmlType() == GGMLType.F32) {
+            System.out.println("Loading F32 tensor as HalfFloatArray");
+            return null;
         } else {
-            quant = (byte) ((readByte(memorySegment, blockOffset + Float16.BYTES + modIndex - GGMLType.Q4_0.getBlockSize() / 2) >>> 4) & 0x0F);
+            // For quantized formats, we need to load through FloatTensor
+            FloatTensor tensor = loadQuantized(entry);
+            HalfFloatArray array = new HalfFloatArray(tensor.size());
+            for (int i = 0; i < tensor.size(); i++) {
+                HalfFloat x = new HalfFloat(tensor.getFloat(i));
+                array.set(i, x);
+            }
+            return array;
         }
-        quant -= 8;
-        return quant * scale;
-    }
-
-    private static Tokenizer createTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
-        String[] mergeLines = (String[]) metadata.get("tokenizer.ggml.merges");
-        List<Pair<Integer, Integer>> merges = Arrays.stream(mergeLines).map(line -> line.split(" "))
-                .map(parts -> new Pair<>(vocabulary.getIndex(parts[0]).orElseThrow(), vocabulary.getIndex(parts[1]).orElseThrow())).toList();
-
-        int allTokens = vocabulary.size();
-        int baseTokens = 128000; // assume all tokens after the base ones are special.
-        int reservedSpecialTokens = allTokens - baseTokens;
-        List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
-
-        assert specialTokensList.stream().allMatch(token -> vocabulary.getIndex(token).isPresent());
-
-        Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed().collect(Collectors.toMap(i -> specialTokensList.get(i), i -> baseTokens + i));
-
-        return new Tokenizer(vocabulary, merges, LLAMA_3_PATTERN, specialTokens);
-    }
-
-    public static FloatTensor loadQuantized(GGMLTensorEntry entry) {
-        GGMLType ggmlType = entry.ggmlType();
-        // System.out.println("Loading quantized tensor of type " + entry.name());
-        return switch (ggmlType) {
-            // case F32 -> new F32FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            case Q8_0 -> new Q8_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            case Q4_0 -> new Q4_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            // case BF16 -> new BF16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            case F16 -> new F16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
-        };
     }
 
     public static FloatTensor[] loadArrayOfQuantized(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
```

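Two things stand out in the final hunk: the F32 branch of loadTensorAsHalfFloatArray currently logs a message and returns null rather than converting, and the quantized/F16 branch pays a one-time element-wise decode-and-re-encode through the FloatTensor view. A hypothetical sanity check for a change like this, comparing the two load paths within half-precision tolerance (not part of this commit):

```java
// Hypothetical check: the FP16 load should match the FP32 load within
// half-precision relative error (about 1e-3 for normal values).
FloatArray f32 = ModelLoader.loadTensorAsFloatArray(entry);
HalfFloatArray f16 = ModelLoader.loadTensorAsHalfFloatArray(entry);
for (int i = 0; i < f32.getSize(); i++) {
    float a = f32.get(i);
    float b = f16.get(i).getFloat32();
    assert Math.abs(a - b) <= 1e-3f * Math.max(1.0f, Math.abs(a));
}
```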
