SQOOP-489. Cannot define partition keys for Hive tables created through Sqoop.
src/test/com/cloudera/sqoop/hive/TestHiveImport.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.hive;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
import com.cloudera.sqoop.tool.BaseSqoopTool;
import com.cloudera.sqoop.tool.CodeGenTool;
import com.cloudera.sqoop.tool.CreateHiveTableTool;
import com.cloudera.sqoop.tool.ImportTool;
import com.cloudera.sqoop.tool.SqoopTool;
import org.apache.commons.cli.ParseException;

/**
 * Test HiveImport capability after an import to HDFS.
 */
public class TestHiveImport extends ImportJobTestCase {

  public static final Log LOG = LogFactory.getLog(
      TestHiveImport.class.getName());

  /**
   * Sets the expected number of columns in the table being manipulated
   * by the test. Under the hood, this sets the expected column names
   * to DATA_COLi for 0 <= i < numCols.
   * @param numCols the number of columns to be created.
   */
  private void setNumCols(int numCols) {
    String [] cols = new String[numCols];
    for (int i = 0; i < numCols; i++) {
      cols[i] = "DATA_COL" + i;
    }

    setColNames(cols);
  }

  /**
   * Create the argv to pass to Sqoop.
   * @return the argv as an array of strings.
   */
  protected String [] getArgv(boolean includeHadoopFlags, String [] moreArgs) {
    ArrayList<String> args = new ArrayList<String>();

    if (includeHadoopFlags) {
      CommonArgs.addHadoopFlags(args);
    }

    if (null != moreArgs) {
      for (String arg: moreArgs) {
        args.add(arg);
      }
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--warehouse-dir");
    args.add(getWarehouseDir());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");
    String [] colNames = getColNames();
    if (null != colNames) {
      args.add("--split-by");
      args.add(colNames[0]);
    } else {
      fail("Could not determine column names.");
    }

    args.add("--num-mappers");
    args.add("1");

    for (String a : args) {
      LOG.debug("ARG : " + a);
    }

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a create-table only job for Hive imports.
   */
  protected String [] getCreateTableArgv(boolean includeHadoopFlags,
      String [] moreArgs) {

    ArrayList<String> args = new ArrayList<String>();

    if (null != moreArgs) {
      for (String arg: moreArgs) {
        args.add(arg);
      }
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a code-gen only job for Hive imports.
   */
  protected String [] getCodeGenArgs() {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a ddl-executing-only job for Hive imports.
   */
  protected String [] getCreateHiveTableArgs(String [] extraArgs) {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());

    if (null != extraArgs) {
      for (String arg : extraArgs) {
        args.add(arg);
      }
    }

    return args.toArray(new String[0]);
  }

  private SqoopOptions getSqoopOptions(String [] args, SqoopTool tool) {
    SqoopOptions opts = null;
    try {
      opts = tool.parseArguments(args, null, null, true);
    } catch (Exception e) {
      fail("Invalid options: " + e.toString());
    }

    return opts;
  }

  private void runImportTest(String tableName, String [] types,
      String [] values, String verificationScript, String [] args,
      SqoopTool tool) throws IOException {

    // create a table and populate it with a row...
    createTableWithColTypes(types, values);

    // set up our mock hive shell to compare our generated script
    // against the correct expected one.
    SqoopOptions options = getSqoopOptions(args, tool);
    String hiveHome = options.getHiveHome();
    assertNotNull("hive.home was not set", hiveHome);
    Path testDataPath = new Path(new Path(hiveHome),
        "scripts/" + verificationScript);
    System.setProperty("expected.script", testDataPath.toString());
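    // The mock hive shell configured for these tests reads this property and
    // compares the script Sqoop actually generates against the named file,
    // exiting non-zero on a mismatch (which is what testHiveExitFails relies
    // on).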

    // verify that we can import it correctly into hive.
    runImport(tool, args);
  }

  /** Test that we can generate a file containing the DDL and not import. */
  @Test
  public void testGenerateOnly() throws IOException {
    final String TABLE_NAME = "GenerateOnly";
    setCurTableName(TABLE_NAME);
    setNumCols(1);

    // Figure out where our target generated .q file is going to be.
    SqoopOptions options = getSqoopOptions(getArgv(false, null),
        new ImportTool());
    Path ddlFile = new Path(new Path(options.getCodeOutputDir()),
        TABLE_NAME + ".q");
    FileSystem fs = FileSystem.getLocal(new Configuration());

    // If it's already there, remove it before running the test to ensure
    // that it's the current test that generated the file.
    if (fs.exists(ddlFile)) {
      if (!fs.delete(ddlFile, false)) {
        LOG.warn("Could not delete previous ddl file: " + ddlFile);
      }
    }

    // Run a basic import, but specify that we're just generating definitions.
    String [] types = { "INTEGER" };
    String [] vals = { "42" };
    runImportTest(TABLE_NAME, types, vals, null, getCodeGenArgs(),
        new CodeGenTool());

    // Test that the generated definition file exists.
    assertTrue("Couldn't find expected ddl file", fs.exists(ddlFile));

    Path hiveImportPath = new Path(new Path(options.getWarehouseDir()),
        TABLE_NAME);
    assertFalse("Import actually happened!", fs.exists(hiveImportPath));
  }


  /** Test that strings and ints are handled in the normal fashion. */
  @Test
  public void testNormalHiveImport() throws IOException {
    final String TABLE_NAME = "NORMAL_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals, "normalImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that the table is created in hive with no data import. */
  @Test
  public void testCreateOnlyHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_ONLY_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals,
        "createOnlyImport.q", getCreateHiveTableArgs(null),
        new CreateHiveTableTool());
  }

  /**
   * Test that the table is created in hive and replaces the existing table,
   * if any.
   */
  @Test
  public void testCreateOverwriteHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_OVERWRITE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {"--hive-overwrite", "--create-hive-table"};
    runImportTest(TABLE_NAME, types, vals,
        "createOverwriteImport.q", getCreateHiveTableArgs(extraArgs),
        new CreateHiveTableTool());
  }

  /** Test that dates are coerced properly to strings. */
  @Test
  public void testDate() throws IOException {
    final String TABLE_NAME = "DATE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "VARCHAR(32)", "DATE" };
    String [] vals = { "'test'", "'2009-05-12'" };
    runImportTest(TABLE_NAME, types, vals, "dateImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that NUMERICs are coerced to doubles. */
  @Test
  public void testNumeric() throws IOException {
    final String TABLE_NAME = "NUMERIC_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    runImportTest(TABLE_NAME, types, vals, "numericImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** If bin/hive returns an error exit status, we should get an IOException. */
  @Test
  public void testHiveExitFails() {
    // The expected script is different from the one that would be generated
    // by this import, so we expect an IOException.
    final String TABLE_NAME = "FAILING_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    try {
      runImportTest(TABLE_NAME, types, vals, "failingImport.q",
          getArgv(false, null), new ImportTool());
      // If we get here, then the run succeeded -- which is incorrect.
      fail("FAILING_HIVE_IMPORT test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }
  }

  /** Test that we can set delimiters how we want them. */
  @Test
  public void testCustomDelimiters() throws IOException {
    final String TABLE_NAME = "CUSTOM_DELIM_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {
        "--fields-terminated-by", ",",
        "--lines-terminated-by", "|",
    };
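    // The chosen terminators should be carried into the Hive table definition
    // that Sqoop generates; customDelimImport.q is the expected script for
    // this delimiter combination.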
    runImportTest(TABLE_NAME, types, vals, "customDelimImport.q",
        getArgv(false, extraArgs), new ImportTool());
  }

  /**
   * Test hive import with a row that has a new line in it.
   */
  @Test
  public void testFieldWithHiveDelims() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_HIVE_IMPORT";

    LOG.info("Doing import of single row into FIELD_WITH_NL_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String[] vals = { "'test with \n new lines \n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG };
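    // HIVE_DROP_DELIMS_ARG (--hive-drop-import-delims) tells Sqoop to strip
    // Hive's delimiter characters (\n, \r and \01) from string fields, so the
    // newlines and \01 bytes embedded in the values above should not survive
    // the import.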

    runImportTest(TABLE_NAME, types, vals, "fieldWithNewlineImport.q",
        getArgv(false, moreArgs), new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_HIVE_IMPORT table");

    // Ideally, we would actually invoke hive code to verify that records with
    // record and field delimiters have those characters removed and that we
    // have the proper number of hive records. Unfortunately, this is a
    // non-trivial task and is better dealt with at the integration-test level.
    //
    // Instead, this assumes the path of the generated table and just validates
    // the map job output.

    // Get and read the raw output file.
    String whDir = getWarehouseDir();
    File p = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    File f = new File(p.toString());
    FileReader fr = new FileReader(f);
    BufferedReader br = new BufferedReader(fr);
    try {
      // verify the output
      assertEquals(br.readLine(), "test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ");
      assertEquals(br.readLine(), null); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test hive import with a row that has a new line in it, using the
   * delimiter replacement option.
   */
  @Test
  public void testFieldWithHiveDelimsReplacement() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT";

    LOG.info("Doing import of single row into "
        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String[] vals = { "'test with\nnew lines\n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " " };
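    // HIVE_DELIMS_REPLACEMENT_ARG (--hive-delims-replacement) substitutes the
    // given string (here a single space) for Hive's delimiter characters
    // (\n, \r and \01) in string fields, instead of dropping them outright.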

    runImportTest(TABLE_NAME, types, vals,
        "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs),
        new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");

    // Ideally, we would actually invoke hive code to verify that records with
    // record and field delimiters have those characters replaced and that we
    // have the proper number of hive records. Unfortunately, this is a
    // non-trivial task and is better dealt with at the integration-test level.
    //
    // Instead, this assumes the path of the generated table and just validates
    // the map job output.

    // Get and read the raw output file.
    String whDir = getWarehouseDir();
    File p = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    File f = new File(p.toString());
    FileReader fr = new FileReader(f);
    BufferedReader br = new BufferedReader(fr);
    try {
      // verify the output
      assertEquals(br.readLine(), "test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ");
      assertEquals(br.readLine(), null); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test hive drop and replace option validation.
   */
  @Test
  public void testHiveDropAndReplaceOptionValidation() throws ParseException {
    LOG.info("Testing conflicting Hive delimiter drop/replace options");

    setNumCols(3);
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " ",
        "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG, };

    ImportTool tool = new ImportTool();
    try {
      tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null,
          null, true));
      fail("Expected InvalidOptionsException");
    } catch (InvalidOptionsException ex) {
      /* success */
    }
  }

  /**
   * Test hive import with a partition key and value.
   */
  @Test
  public void testImportHiveWithPartitions() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "PARTITION_HIVE_IMPORT";

    LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
    String[] vals = { "'whoop'", "42", "'I am a row in a partition'", };
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds",
        "--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", };

    runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
        getArgv(false, moreArgs), new ImportTool());
  }

  /**
   * If the partition key is set to one of the imported columns, we should
   * get an IOException.
   */
  @Test
  public void testImportWithBadPartitionKey() {
    final String TABLE_NAME = "FAILING_PARTITION_HIVE_IMPORT";

    LOG.info("Doing import of single row into " + TABLE_NAME + " table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
    String[] vals = { "'key'", "42", "'I am a row in a partition'", };

    String partitionKey = getColNames()[0];

    // Specify the 1st column as the partition key and import every column of
    // the table by default (i.e. no --columns option).
    String[] moreArgs1 = {
        "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG,
        partitionKey,
    };

    // Specify the 1st column as both the partition key and an imported column.
    String[] moreArgs2 = {
        "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG,
        partitionKey,
        "--" + BaseSqoopTool.COLUMNS_ARG,
        partitionKey,
    };
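    // Both argument sets name an imported column as the partition key. Hive
    // does not allow a partition column to also appear in the table's column
    // list, so Sqoop should reject each of these runs with an IOException.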

    // Test hive-import with the 1st args.
    try {
      runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
          getArgv(false, moreArgs1), new ImportTool());
      fail(TABLE_NAME + " test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }

    // Test hive-import with the 2nd args.
    try {
      runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
          getArgv(false, moreArgs2), new ImportTool());
      fail(TABLE_NAME + " test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }

    // Test create-hive-table with the 1st args.
    try {
      runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
          getCreateTableArgv(false, moreArgs1), new CreateHiveTableTool());
      fail(TABLE_NAME + " test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }
  }
}