Skip to content

Commit ca9abc2

Browse files
committed
Add ORC reader and writer
1 parent cc40bce commit ca9abc2

4 files changed

Lines changed: 294 additions & 0 deletions

File tree

big-data/orc-demo/pom.xml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
<!-- Demo project showing how to read and write ORC files from plain Java. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.javahelps.orc</groupId>
    <artifactId>orc-demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <!-- Compile for Java 14 (source and bytecode level). -->
                <configuration>
                    <source>14</source>
                    <target>14</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Core ORC file format implementation (Reader, Writer, TypeDescription). -->
        <dependency>
            <groupId>org.apache.orc</groupId>
            <artifactId>orc-core</artifactId>
            <version>1.6.3</version>
        </dependency>

        <!-- Command-line/utility tooling for ORC files. -->
        <dependency>
            <groupId>org.apache.orc</groupId>
            <artifactId>orc-tools</artifactId>
            <version>1.6.3</version>
        </dependency>

        <!-- MapReduce input/output format support for ORC. -->
        <dependency>
            <groupId>org.apache.orc</groupId>
            <artifactId>orc-mapreduce</artifactId>
            <version>1.6.3</version>
        </dependency>

        <!-- Hadoop Configuration/Path/FileSystem classes used by the ORC API. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.3.0</version>
        </dependency>

        <!-- NOTE(review): presumably pinned to resolve a transitive-version conflict
             between Hadoop and ORC — confirm before removing. -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>29.0-jre</version>
        </dependency>
    </dependencies>

</project>
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package com.javahelps.orc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.*;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

import java.io.IOException;
import java.util.*;
import java.util.function.BiFunction;

/**
 * Reads rows from an ORC file into {@code Map<columnName, value>} records,
 * materializing only a selected subset of top-level columns.
 */
public class OrcFileReader {

    /** Number of rows fetched per vectorized batch. */
    private static final int BATCH_SIZE = 2048;

    public static void main(String[] args) throws IOException {
        List<Map<String, Object>> rows = read(new Configuration(), "orders.orc");
        for (Map<String, Object> row : rows) {
            System.out.println(row);
        }
    }

    /**
     * Reads the selected columns of every row in the given ORC file.
     *
     * @param configuration Hadoop configuration used to open the file
     * @param path          path to the ORC file
     * @return one map per row, keyed by column name; absent values are {@code null}
     * @throws IOException if the file cannot be opened or read
     */
    public static List<Map<String, Object>> read(Configuration configuration, String path)
            throws IOException {
        // Collect rows in insertion order; size is unknown up front.
        List<Map<String, Object>> rows = new LinkedList<>();

        // Create an ORC reader using the Hadoop fileSystem and path.
        try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(configuration))) {
            // Extract the schema and metadata from the reader.
            TypeDescription schema = reader.getSchema();
            List<String> fieldNames = schema.getFieldNames();
            List<TypeDescription> columnTypes = schema.getChildren();

            // Select only the order_id, item_name and price columns.
            List<Integer> selectedColumns = new ArrayList<>();
            boolean[] columnsToRead =
                    createColumnsToRead(schema, Set.of("order_id", "item_name", "price"), selectedColumns);

            // One value-extraction function per selected top-level column.
            int size = fieldNames.size();
            @SuppressWarnings("unchecked") // generic array creation; only indices in selectedColumns are populated
            BiFunction<ColumnVector, Integer, Object>[] mappers = new BiFunction[size];
            for (int i : selectedColumns) {
                mappers[i] = createColumnReader(columnTypes.get(i));
            }

            // Pass columnsToRead to the reader so only the selected columns are decoded.
            try (RecordReader records = reader.rows(reader.options().include(columnsToRead))) {
                // Read rows in batches for better performance.
                VectorizedRowBatch batch = schema.createRowBatch(BATCH_SIZE);
                while (records.nextBatch(batch)) {
                    for (int row = 0; row < batch.size; row++) {
                        Map<String, Object> map = new HashMap<>(selectedColumns.size());
                        for (int col : selectedColumns) {
                            ColumnVector columnVector = batch.cols[col];
                            // When isRepeating is set, entry 0 holds the value for every
                            // row of the batch — indexing by `row` would read garbage.
                            int index = columnVector.isRepeating ? 0 : row;
                            // isNull[] is only meaningful when noNulls is false.
                            if (!columnVector.noNulls && columnVector.isNull[index]) {
                                map.put(fieldNames.get(col), null);
                            } else {
                                map.put(fieldNames.get(col), mappers[col].apply(columnVector, index));
                            }
                        }
                        rows.add(map);
                    }
                }
            }
        }
        return rows;
    }

    /**
     * Builds the {@code include} mask for {@link Reader.Options}.
     *
     * <p>The mask is indexed by type id (0 is the root struct) and must cover every
     * nested sub-type of a selected column, hence the inner id loop.
     *
     * @param schema  root struct schema of the file
     * @param columns names of the top-level columns to read
     * @param indices out-parameter: receives the batch indices of the selected columns
     * @return boolean mask of type ids to decode
     */
    public static boolean[] createColumnsToRead(TypeDescription schema, Set<String> columns, List<Integer> indices) {
        boolean[] columnsToRead = new boolean[schema.getMaximumId() + 1];
        List<String> fieldNames = schema.getFieldNames();
        List<TypeDescription> columnTypes = schema.getChildren();
        for (int i = 0; i < fieldNames.size(); i++) {
            if (columns.contains(fieldNames.get(i))) {
                indices.add(i);
                TypeDescription type = columnTypes.get(i);
                // Mark the column's own id and every nested child id.
                for (int id = type.getId(); id <= type.getMaximumId(); id++) {
                    columnsToRead[id] = true;
                }
            }
        }
        return columnsToRead;
    }

    /**
     * Returns a function that extracts the Java value at a given row index from
     * the {@link ColumnVector} of the given ORC type.
     *
     * <p>Reference: https://orc.apache.org/docs/core-java.html
     *
     * @throws RuntimeException if the ORC category is not supported here
     */
    public static BiFunction<ColumnVector, Integer, Object> createColumnReader(TypeDescription description) {
        String type = description.getCategory().getName();
        BiFunction<ColumnVector, Integer, Object> mapper;
        if ("tinyint".equals(type)) {
            mapper = (columnVector, row) -> (byte) ((LongColumnVector) columnVector).vector[row];
        } else if ("smallint".equals(type)) {
            mapper = (columnVector, row) -> (short) ((LongColumnVector) columnVector).vector[row];
        } else if ("int".equals(type) || "date".equals(type)) {
            // Date is represented as int epoch days.
            mapper = (columnVector, row) -> (int) ((LongColumnVector) columnVector).vector[row];
        } else if ("bigint".equals(type)) {
            mapper = (columnVector, row) -> ((LongColumnVector) columnVector).vector[row];
        } else if ("boolean".equals(type)) {
            // Booleans are stored as 0/1 longs.
            mapper = (columnVector, row) -> ((LongColumnVector) columnVector).vector[row] == 1;
        } else if ("float".equals(type)) {
            mapper = (columnVector, row) -> (float) ((DoubleColumnVector) columnVector).vector[row];
        } else if ("double".equals(type)) {
            mapper = (columnVector, row) -> ((DoubleColumnVector) columnVector).vector[row];
        } else if ("decimal".equals(type)) {
            mapper = (columnVector, row) -> ((DecimalColumnVector) columnVector).vector[row].getHiveDecimal().bigDecimalValue();
        } else if ("string".equals(type) || type.startsWith("varchar")) {
            mapper = (columnVector, row) -> ((BytesColumnVector) columnVector).toString(row);
        } else if ("char".equals(type)) {
            mapper = (columnVector, row) -> ((BytesColumnVector) columnVector).toString(row).charAt(0);
        } else if ("timestamp".equals(type)) {
            mapper = (columnVector, row) -> ((TimestampColumnVector) columnVector).getTimestampAsLong(row);
        } else {
            throw new RuntimeException("Unsupported type " + type);
        }
        return mapper;
    }
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package com.javahelps.orc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.*;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.IOException;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;

/**
 * Writes {@code Map<columnName, value>} records to an ORC file using
 * vectorized row batches.
 */
public class OrcFileWriter {

    public static void main(String[] args) throws IOException {
        List<Map<String, Object>> data = new LinkedList<>();
        data.add(Map.of("order_id", 1, "item_name", "Laptop", "price", 800.0f));
        data.add(Map.of("order_id", 2, "item_name", "Mouse", "price", 150.0f));
        data.add(Map.of("order_id", 3, "item_name", "Keyboard", "price", 250.0f));

        write(new Configuration(), "orders.orc", "struct<order_id:int,item_name:string,price:float>", data);

        System.out.println("Done");
    }

    /**
     * Writes the given rows to an ORC file with the given struct schema.
     *
     * <p>A {@code null} (or absent) value for a column is written as an ORC null
     * rather than causing a {@link NullPointerException}.
     *
     * @param configuration Hadoop configuration used to create the file
     * @param path          destination path of the ORC file
     * @param struct        ORC schema string, e.g. {@code struct<id:int,name:string>}
     * @param data          rows to write, keyed by column name
     * @throws IOException if the file cannot be created or written
     */
    public static void write(Configuration configuration, String path, String struct, List<Map<String, Object>> data) throws IOException {
        // Create the schema and extract metadata from it.
        TypeDescription schema = TypeDescription.fromString(struct);
        List<String> fieldNames = schema.getFieldNames();
        List<TypeDescription> columnTypes = schema.getChildren();

        // Create a reusable row batch (default capacity).
        VectorizedRowBatch batch = schema.createRowBatch();

        // One value-writing function per top-level column, bound to its vector.
        List<BiConsumer<Integer, Object>> consumers = new ArrayList<>(columnTypes.size());
        for (int i = 0; i < columnTypes.size(); i++) {
            consumers.add(createColumnWriter(columnTypes.get(i), batch.cols[i]));
        }

        // Open a writer to write the data to an ORC file.
        try (Writer writer = OrcFile.createWriter(new Path(path),
                OrcFile.writerOptions(configuration)
                        .setSchema(schema))) {
            for (Map<String, Object> row : data) {
                // batch.size must be advanced by the caller of the column writers.
                int rowNum = batch.size++;

                // Write each column to its associated column vector.
                for (int i = 0; i < fieldNames.size(); i++) {
                    Object value = row.get(fieldNames.get(i));
                    if (value == null) {
                        // Missing value: record an ORC null instead of throwing NPE.
                        ColumnVector vector = batch.cols[i];
                        vector.noNulls = false;
                        vector.isNull[rowNum] = true;
                    } else {
                        consumers.get(i).accept(rowNum, value);
                    }
                }

                // If the batch is full, flush it to disk.
                if (batch.size == batch.getMaxSize()) {
                    writer.addRowBatch(batch);
                    batch.reset();
                }
            }

            // Flush any remaining buffered rows before closing.
            if (batch.size != 0) {
                writer.addRowBatch(batch);
            }
        }
    }

    /**
     * Returns a function that writes a non-null Java value into the given
     * {@link ColumnVector} at a row index, for the given ORC type.
     *
     * @throws RuntimeException if the ORC category is not supported here
     */
    public static BiConsumer<Integer, Object> createColumnWriter(TypeDescription description, ColumnVector columnVector) {
        String type = description.getCategory().getName();
        BiConsumer<Integer, Object> consumer;
        if ("tinyint".equals(type)) {
            consumer = (row, val) -> ((LongColumnVector) columnVector).vector[row] = ((Number) val).longValue();
        } else if ("smallint".equals(type)) {
            consumer = (row, val) -> ((LongColumnVector) columnVector).vector[row] = ((Number) val).longValue();
        } else if ("int".equals(type) || "date".equals(type)) {
            // Date is represented as int epoch days.
            consumer = (row, val) -> ((LongColumnVector) columnVector).vector[row] = ((Number) val).longValue();
        } else if ("bigint".equals(type)) {
            consumer = (row, val) -> ((LongColumnVector) columnVector).vector[row] = ((Number) val).longValue();
        } else if ("boolean".equals(type)) {
            // Booleans are stored as 0/1 longs.
            consumer = (row, val) -> ((LongColumnVector) columnVector).vector[row] = (Boolean) val ? 1 : 0;
        } else if ("float".equals(type)) {
            consumer = (row, val) -> ((DoubleColumnVector) columnVector).vector[row] = ((Number) val).floatValue();
        } else if ("double".equals(type)) {
            consumer = (row, val) -> ((DoubleColumnVector) columnVector).vector[row] = ((Number) val).doubleValue();
        } else if ("decimal".equals(type)) {
            consumer = (row, val) -> ((DecimalColumnVector) columnVector).vector[row].set(HiveDecimal.create((BigDecimal) val));
        } else if ("string".equals(type) || type.startsWith("varchar") || "char".equals(type)) {
            consumer = (row, val) -> {
                // A fresh buffer is allocated per value, so setRef (no copy) is safe.
                byte[] buffer = val.toString().getBytes(StandardCharsets.UTF_8);
                ((BytesColumnVector) columnVector).setRef(row, buffer, 0, buffer.length);
            };
        } else if ("timestamp".equals(type)) {
            consumer = (row, val) -> ((TimestampColumnVector) columnVector).set(row, (Timestamp) val);
        } else {
            throw new RuntimeException("Unsupported type " + type);
        }
        return consumer;
    }
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Keep demo output readable: log only ERROR and above, to the console appender "CA".
log4j.rootLogger=ERROR, CA

# "CA" writes to stdout/stderr.
log4j.appender.CA=org.apache.log4j.ConsoleAppender

# Pattern: elapsed-millis [thread] LEVEL logger NDC - message\n
log4j.appender.CA.layout=org.apache.log4j.PatternLayout
log4j.appender.CA.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

0 commit comments

Comments
 (0)