SQOOP-212. Hive import for existing table does not work.
src/test/com/cloudera/sqoop/hive/TestHiveImport.java
/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.hive;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
import com.cloudera.sqoop.tool.BaseSqoopTool;
import com.cloudera.sqoop.tool.CodeGenTool;
import com.cloudera.sqoop.tool.CreateHiveTableTool;
import com.cloudera.sqoop.tool.ImportTool;
import com.cloudera.sqoop.tool.SqoopTool;

/**
 * Test HiveImport capability after an import to HDFS.
 */
public class TestHiveImport extends ImportJobTestCase {

  public static final Log LOG = LogFactory.getLog(
      TestHiveImport.class.getName());

  /**
   * Sets the expected number of columns in the table being manipulated
   * by the test. Under the hood, this sets the expected column names
   * to DATA_COLi for 0 <= i < numCols.
   * @param numCols the number of columns to be created.
   */
  private void setNumCols(int numCols) {
    String [] cols = new String[numCols];
    for (int i = 0; i < numCols; i++) {
      cols[i] = "DATA_COL" + i;
    }

    setColNames(cols);
  }

  /**
   * Create the argv to pass to Sqoop.
   * @param includeHadoopFlags if true, prepend the common Hadoop test flags.
   * @param moreArgs additional arguments to prepend to the generated list.
   * @return the argv as an array of strings.
   */
  protected String [] getArgv(boolean includeHadoopFlags, String [] moreArgs) {
    ArrayList<String> args = new ArrayList<String>();

    if (includeHadoopFlags) {
      CommonArgs.addHadoopFlags(args);
    }

    if (null != moreArgs) {
      for (String arg: moreArgs) {
        args.add(arg);
      }
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--warehouse-dir");
    args.add(getWarehouseDir());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");
    String [] colNames = getColNames();
    if (null != colNames) {
      args.add("--split-by");
      args.add(colNames[0]);
    } else {
      fail("Could not determine column names.");
    }

    args.add("--num-mappers");
    args.add("1");

    for (String a : args) {
      LOG.debug("ARG : " + a);
    }

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a code-gen only job for Hive imports.
   */
  protected String [] getCodeGenArgs() {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a ddl-executing-only job for Hive imports.
   */
  protected String [] getCreateHiveTableArgs(String [] extraArgs) {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());

    if (null != extraArgs) {
      for (String arg : extraArgs) {
        args.add(arg);
      }
    }

    return args.toArray(new String[0]);
  }

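  /**
   * Parse the given argument array with the given SqoopTool, failing the
   * test if the arguments cannot be parsed.
   */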
  private SqoopOptions getSqoopOptions(String [] args, SqoopTool tool) {
    SqoopOptions opts = null;
    try {
      opts = tool.parseArguments(args, null, null, true);
    } catch (Exception e) {
      fail("Invalid options: " + e.toString());
    }

    return opts;
  }

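  /**
   * Create and populate a one-row table, point the mock hive shell at the
   * expected script via the "expected.script" system property, and then run
   * the given tool with the given arguments.
   */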
  private void runImportTest(String tableName, String [] types,
      String [] values, String verificationScript, String [] args,
      SqoopTool tool) throws IOException {

    // create a table and populate it with a row...
    createTableWithColTypes(types, values);

    // set up our mock hive shell to compare our generated script
    // against the correct expected one.
    SqoopOptions options = getSqoopOptions(args, tool);
    String hiveHome = options.getHiveHome();
    assertNotNull("hive.home was not set", hiveHome);
    Path testDataPath = new Path(new Path(hiveHome),
        "scripts/" + verificationScript);
    System.setProperty("expected.script", testDataPath.toString());

    // verify that we can import it correctly into hive.
    runImport(tool, args);
  }

  /** Test that we can generate a file containing the DDL and not import. */
  @Test
  public void testGenerateOnly() throws IOException {
    final String TABLE_NAME = "GenerateOnly";
    setCurTableName(TABLE_NAME);
    setNumCols(1);

    // Figure out where our target generated .q file is going to be.
    SqoopOptions options = getSqoopOptions(getArgv(false, null),
        new ImportTool());
    Path ddlFile = new Path(new Path(options.getCodeOutputDir()),
        TABLE_NAME + ".q");
    FileSystem fs = FileSystem.getLocal(new Configuration());

    // If it's already there, remove it before running the test to ensure
    // that it's the current test that generated the file.
    if (fs.exists(ddlFile)) {
      if (!fs.delete(ddlFile, false)) {
        LOG.warn("Could not delete previous ddl file: " + ddlFile);
      }
    }

    // Run a basic import, but specify that we're just generating definitions.
    String [] types = { "INTEGER" };
    String [] vals = { "42" };
    runImportTest(TABLE_NAME, types, vals, null, getCodeGenArgs(),
        new CodeGenTool());

    // Test that the generated definition file exists.
    assertTrue("Couldn't find expected ddl file", fs.exists(ddlFile));

    Path hiveImportPath = new Path(new Path(options.getWarehouseDir()),
        TABLE_NAME);
    assertFalse("Import actually happened!", fs.exists(hiveImportPath));
  }

  /** Test that strings and ints are handled in the normal fashion. */
  @Test
  public void testNormalHiveImport() throws IOException {
    final String TABLE_NAME = "NORMAL_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals, "normalImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that table is created in hive with no data import. */
  @Test
  public void testCreateOnlyHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_ONLY_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals,
        "createOnlyImport.q", getCreateHiveTableArgs(null),
        new CreateHiveTableTool());
  }

  /**
   * Test that table is created in hive and replaces the existing table if
   * any.
   */
  @Test
  public void testCreateOverwriteHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_OVERWRITE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
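    // Per SQOOP-212, this combination of options should produce a script
    // that replaces any existing Hive table definition.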
    String [] extraArgs = {"--hive-overwrite", "--create-hive-table"};
    runImportTest(TABLE_NAME, types, vals,
        "createOverwriteImport.q", getCreateHiveTableArgs(extraArgs),
        new CreateHiveTableTool());
  }

  /** Test that dates are coerced properly to strings. */
  @Test
  public void testDate() throws IOException {
    final String TABLE_NAME = "DATE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "VARCHAR(32)", "DATE" };
    String [] vals = { "'test'", "'2009-05-12'" };
    runImportTest(TABLE_NAME, types, vals, "dateImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that NUMERICs are coerced to doubles. */
  @Test
  public void testNumeric() throws IOException {
    final String TABLE_NAME = "NUMERIC_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    runImportTest(TABLE_NAME, types, vals, "numericImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** If bin/hive returns an error exit status, we should get an IOException. */
  @Test
  public void testHiveExitFails() {
    // The expected script is different from the one that would be generated
    // by this import, so we expect an IOException.
    final String TABLE_NAME = "FAILING_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    try {
      runImportTest(TABLE_NAME, types, vals, "failingImport.q",
          getArgv(false, null), new ImportTool());
      // If we get here, then the run succeeded -- which is incorrect.
      fail("FAILING_HIVE_IMPORT test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }
  }

  /** Test that we can set delimiters how we want them. */
  @Test
  public void testCustomDelimiters() throws IOException {
    final String TABLE_NAME = "CUSTOM_DELIM_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {
      "--fields-terminated-by", ",",
      "--lines-terminated-by", "|",
    };
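    // The expected script (customDelimImport.q) should reflect these
    // delimiters in the generated table definition.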
    runImportTest(TABLE_NAME, types, vals, "customDelimImport.q",
        getArgv(false, extraArgs), new ImportTool());
  }

  /**
   * Test hive import with a row that contains new lines and field delimiters.
   */
  @Test
  public void testFieldWithHiveDelims() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_HIVE_IMPORT";

    LOG.info("Doing import of single row into FIELD_WITH_NL_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String[] vals = { "'test with \n new lines \n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
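    // The --hive-drop-import-delims option should strip the embedded newline
    // and \01 characters from string fields during the import.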
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG };

    runImportTest(TABLE_NAME, types, vals, "fieldWithNewlineImport.q",
        getArgv(false, moreArgs), new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_HIVE_IMPORT table");

    // Ideally, we would actually invoke hive code to verify that records with
    // record and field delimiters have their values replaced and that we have
    // the proper number of hive records. Unfortunately, this is a non-trivial
    // task, and is better dealt with at an integration test level.
    //
    // Instead, this test assumes the path of the generated table and just
    // validates the map job output.

    // Get and read the raw output file.
    String whDir = getWarehouseDir();
    File f = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    FileReader fr = new FileReader(f);
    BufferedReader br = new BufferedReader(fr);
    try {
      // verify the output
      assertEquals(br.readLine(), "test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ");
      assertEquals(br.readLine(), null); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test hive import into a partitioned table.
   */
  @Test
  public void testImportHiveWithPartitions() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "PARTITION_HIVE_IMPORT";

    LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
    String[] vals = { "'whoop'", "42", "'I am a row in a partition'", };
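    // Import into a single static Hive partition: ds=20110413.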
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds",
        "--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", };

    runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
        getArgv(false, moreArgs), new ImportTool());
  }

}