fa366ccce6b61a279cb893a208d94ee5408cec63
[spark.git] / sql / core / src / main / scala / org / apache / spark / sql / execution / datasources / csv / CSVFileFormat.scala
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 package org.apache.spark.sql.execution.datasources.csv
19
20 import org.apache.hadoop.conf.Configuration
21 import org.apache.hadoop.fs.{FileStatus, Path}
22 import org.apache.hadoop.mapreduce._
23
24 import org.apache.spark.internal.Logging
25 import org.apache.spark.sql.{AnalysisException, SparkSession}
26 import org.apache.spark.sql.catalyst.InternalRow
27 import org.apache.spark.sql.catalyst.util.CompressionCodecs
28 import org.apache.spark.sql.execution.datasources._
29 import org.apache.spark.sql.sources._
30 import org.apache.spark.sql.types._
31 import org.apache.spark.util.SerializableConfiguration
32
/**
 * Provides access to CSV data from pure SQL statements.
 *
 * Registered under the short name "csv". Reading, schema inference and the split check are
 * delegated to the [[CSVDataSource]] selected by the parsed [[CSVOptions]]; writing goes
 * through [[CsvOutputWriter]].
 */
class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {

  override def shortName(): String = "csv"

  /**
   * Builds the [[CSVOptions]] used by the split check, schema inference and the write path:
   * the caller-supplied options combined with the session's CSV column-pruning flag and
   * session-local time zone.
   *
   * Only call this on the driver, outside of any closure or serializable anonymous class,
   * so that those never need a reference to this format instance.
   */
  private def sessionCsvOptions(
      sparkSession: SparkSession,
      options: Map[String, String]): CSVOptions = {
    val sqlConf = sparkSession.sessionState.conf
    new CSVOptions(
      options,
      columnPruning = sqlConf.csvColumnPruning,
      sqlConf.sessionLocalTimeZone)
  }

  /**
   * A file is splitable only when both the CSV data source chosen for these options and the
   * text-based parent format say so. The data source is consulted first.
   */
  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = {
    val dataSource = CSVDataSource(sessionCsvOptions(sparkSession, options))
    dataSource.isSplitable && super.isSplitable(sparkSession, options, path)
  }

  /**
   * Delegates schema inference for `files` to the CSV data source chosen for these options.
   */
  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    val inferenceOptions = sessionCsvOptions(sparkSession, options)
    val dataSource = CSVDataSource(inferenceOptions)
    dataSource.inferSchema(sparkSession, files, inferenceOptions)
  }

  /**
   * Verifies the write schema, applies the configured compression codec (if any) to the job
   * configuration and returns a factory producing [[CsvOutputWriter]]s.
   */
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    DataSourceUtils.verifyWriteSchema(this, dataSchema)
    val jobConf = job.getConfiguration
    val writeOptions = sessionCsvOptions(sparkSession, options)
    for (codec <- writeOptions.compressionCodec) {
      CompressionCodecs.setCodecConfiguration(jobConf, codec)
    }

    // The factory only captures `writeOptions`, never this format instance.
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter =
        new CsvOutputWriter(path, dataSchema, context, writeOptions)

      override def getFileExtension(context: TaskAttemptContext): String =
        ".csv" + CodecStreams.getCompressionExtension(context)
    }
  }

  /**
   * Validates the read request on the driver and returns the function that reads one
   * [[PartitionedFile]] into rows.
   *
   * An [[AnalysisException]] is thrown when the corrupt-record column is present in
   * `dataSchema` but is not a nullable string, or when `requiredSchema` consists of the
   * corrupt-record column alone.
   */
  override def buildReader(
      sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = {
    DataSourceUtils.verifyReadSchema(this, dataSchema)
    val confBroadcast =
      sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))

    val sqlConf = sparkSession.sessionState.conf
    val readOptions = new CSVOptions(
      options,
      sqlConf.csvColumnPruning,
      sqlConf.sessionLocalTimeZone,
      sqlConf.columnNameOfCorruptRecord)
    val corruptColumn = readOptions.columnNameOfCorruptRecord

    // Check a field requirement for corrupt records here to throw an exception in a driver side
    dataSchema.getFieldIndex(corruptColumn).foreach { index =>
      val corruptField = dataSchema(index)
      val usable = corruptField.dataType == StringType && corruptField.nullable
      if (!usable) {
        throw new AnalysisException(
          "The field for corrupt records must be string type and nullable")
      }
    }

    // Reject queries whose only required column is the corrupt-record column.
    requiredSchema.fields match {
      case Array(only) if only.name == corruptColumn =>
        throw new AnalysisException(
          "Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the\n" +
            "referenced columns only include the internal corrupt record column\n" +
            "(named _corrupt_record by default). For example:\n" +
            "spark.read.schema(schema).csv(file).filter($\"_corrupt_record\".isNotNull).count()\n" +
            "and spark.read.schema(schema).csv(file).select(\"_corrupt_record\").show().\n" +
            "Instead, you can cache or save the parsed results and then send the same query.\n" +
            "For example, val df = spark.read.schema(schema).csv(file).cache() and then\n" +
            "df.filter($\"_corrupt_record\".isNotNull).count()."
        )
      case _ => // anything else is allowed
    }
    val caseSensitive = sqlConf.caseSensitiveAnalysis

    (file: PartitionedFile) => {
      val fileConf = confBroadcast.value.value
      // The parser is given both schemas without the corrupt-record column, while
      // readFile below still receives the full required and data schemas.
      val parserDataSchema = StructType(dataSchema.filter(_.name != corruptColumn))
      val parserRequiredSchema = StructType(requiredSchema.filter(_.name != corruptColumn))
      val parser = new UnivocityParser(parserDataSchema, parserRequiredSchema, readOptions)
      CSVDataSource(readOptions).readFile(
        fileConf,
        file,
        parser,
        requiredSchema,
        dataSchema,
        caseSensitive)
    }
  }

  override def toString: String = "CSV"

  // All instances are interchangeable: hash on the class, equal to any other CSVFileFormat.
  override def hashCode(): Int = getClass.hashCode()

  override def equals(other: Any): Boolean = other match {
    case _: CSVFileFormat => true
    case _ => false
  }
}
157
/**
 * [[OutputWriter]] that forwards each row to a [[UnivocityGenerator]] writing to `path`.
 *
 * @param path       location of the output file
 * @param dataSchema schema passed to the generator
 * @param context    task attempt context used to open the output stream
 * @param params     CSV options passed to the generator
 */
private[csv] class CsvOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext,
    params: CSVOptions) extends OutputWriter with Logging {

  // Opened eagerly at construction time, before the generator that writes into it.
  private val charStream = CodecStreams.createOutputStreamWriter(context, new Path(path))

  private val generator = new UnivocityGenerator(dataSchema, charStream, params)

  override def write(row: InternalRow): Unit = {
    generator.write(row)
  }

  override def close(): Unit = {
    generator.close()
  }
}