diff --git a/pom.xml b/pom.xml
index bed4d63b4..ad05d3e23 100644
--- a/pom.xml
+++ b/pom.xml
@@ -156,6 +156,16 @@
avro
${avro.version}
+
+
+ org.apache.orc
+ orc
+
+
+ org.apache.orc
+ orc-core
+ 2.1.3
+
diff --git a/xtable-core/pom.xml b/xtable-core/pom.xml
index 6bd5282c7..2180feedf 100644
--- a/xtable-core/pom.xml
+++ b/xtable-core/pom.xml
@@ -62,6 +62,7 @@
avro
+
org.scala-lang
diff --git a/xtable-core/src/main/java/org/apache/xtable/orc/ORCMetadataExtractor.java b/xtable-core/src/main/java/org/apache/xtable/orc/ORCMetadataExtractor.java
new file mode 100644
index 000000000..87fee6b63
--- /dev/null
+++ b/xtable-core/src/main/java/org/apache/xtable/orc/ORCMetadataExtractor.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.xtable.orc;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
+
+
+public class ORCMetadataExtractor {
+ private static final ORCMetadataExtractor INSTANCE = new ORCMetadataExtractor();
+ public static ORCMetadataExtractor getInstance() {
+ return INSTANCE;
+ }
+ public static TypeDescription getSchema(Configuration conf,Path filePath) throws IOException {
+ Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
+ return reader.getSchema();
+
+ }
+}
diff --git a/xtable-core/src/main/java/org/apache/xtable/orc/ORCSchemaExtractor.java b/xtable-core/src/main/java/org/apache/xtable/orc/ORCSchemaExtractor.java
new file mode 100644
index 000000000..c9edf53f6
--- /dev/null
+++ b/xtable-core/src/main/java/org/apache/xtable/orc/ORCSchemaExtractor.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.xtable.orc;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+import lombok.AccessLevel;
+import lombok.NoArgsConstructor;
+
+
+import org.apache.xtable.exception.UnsupportedSchemaTypeException;
+import org.apache.xtable.model.schema.InternalField;
+import org.apache.xtable.model.schema.InternalSchema;
+import org.apache.xtable.model.schema.InternalType;
+import org.apache.xtable.schema.SchemaUtils;
+import org.apache.orc.TypeDescription;
+
+/**
+ * Class that converts Avro Schema {@link } to Canonical Schema {@link InternalSchema} and
+ * vice-versa. This conversion is fully reversible and there is a strict 1 to 1 mapping between avro
+ * data types and canonical data types.
+ */
+@NoArgsConstructor(access = AccessLevel.PRIVATE)
+public class ORCSchemaExtractor {
+ private static final org.apache.xtable.orc.ORCSchemaExtractor INSTANCE = new org.apache.xtable.orc.ORCSchemaExtractor();
+
+ public static org.apache.xtable.orc.ORCSchemaExtractor getInstance() {
+ return INSTANCE;
+ }
+
+ private static boolean isNullable(TypeDescription schema) {
+ List subFields = schema.getChildren();
+ for (TypeDescription subField : subFields) {
+ if (subField.equals(null)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Converts the ORC {@link } to {@link InternalSchema}.
+ *
+ * @param schema The schema being converted
+ * @param parentPath If this schema is nested within another, this will be a dot separated string
+ * representing the path from the top most field to the current schema.
+ * @return a converted schema
+ */
+ //TODO other types and precision and scale for decimal types
+ private InternalSchema toInternalSchema(
+ TypeDescription schema, String parentPath) {
+ InternalType newDataType;
+ Map metadata = new HashMap<>();
+ switch (schema.
+ getCategory()) {
+ case INT:
+ newDataType = InternalType.INT;
+ break;
+ case STRING:
+ newDataType = InternalType.STRING;
+ break;
+ case BOOLEAN:
+ newDataType = InternalType.BOOLEAN;
+ break;
+ case BYTE:
+ newDataType = InternalType.BYTES;
+ break;
+ case DOUBLE:
+ newDataType = InternalType.DOUBLE;
+ break;
+ case FLOAT:
+ newDataType = InternalType.FLOAT;
+ break;
+ case LONG:
+ newDataType = InternalType.LONG;
+ break;
+ case DECIMAL:
+ newDataType = InternalType.DECIMAL;
+ break;
+ case LIST:
+ int childId = schema.getId();
+ InternalSchema elementSchema =
+ toInternalSchema(
+ schema,
+ SchemaUtils.getFullyQualifiedPath(
+ parentPath, InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME)
+ );
+ InternalField elementField =
+ InternalField.builder()
+ .name(InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME)
+ .parentPath(parentPath)
+ .schema(elementSchema)
+ .fieldId(childId)
+ .build();
+ return InternalSchema.builder()
+ .name(schema.getFullFieldName())
+ .dataType(InternalType.LIST)
+ .comment(schema.toString())
+ .isNullable(isNullable(schema))
+ .fields(Collections.singletonList(elementField))
+ .build();
+ case UNION:
+ default:
+ throw new UnsupportedSchemaTypeException(
+ String.format("Unsupported schema type %s", schema));
+ }
+ return InternalSchema.builder()
+ .name(schema.getFullFieldName())
+ .dataType(newDataType)
+ .comment(schema.toString())
+ .isNullable(isNullable(schema))
+ .metadata(metadata.isEmpty() ? null : metadata)
+ .build();
+ }
+
+ // TODO refine fromInternalSchema types
+ public TypeDescription fromInternalSchema(InternalSchema internalSchema, String currentPath) {
+ TypeDescription type = null;
+ String fieldName = internalSchema.getName();
+ InternalType internalType = internalSchema.getDataType();
+ switch (internalType) {
+ case BOOLEAN:
+ type = TypeDescription.createBoolean();
+ break;
+ case INT:
+ type = TypeDescription.createInt();
+ break;
+ case LONG:
+ type = TypeDescription.createLong();
+ break;
+ case STRING:
+ type = TypeDescription.createString();
+ break;
+ case FLOAT:
+ type = TypeDescription.createFloat();
+ break;
+ case DECIMAL:
+ type = TypeDescription.createDecimal();
+ break;
+ case DATE:
+ type = TypeDescription.createDate();
+ break;
+ case TIMESTAMP:
+ type = TypeDescription.createTimestamp();
+ break;
+ default:
+ throw new UnsupportedSchemaTypeException(
+ "Encountered unhandled type during InternalSchema to ORC conversion:"
+ + internalType);
+ }
+ return type;
+ }
+}
+