SQOOP-931: Integrate HCatalog with Sqoop
src/test/com/cloudera/sqoop/hive/TestHiveImport.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.hive;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
import com.cloudera.sqoop.tool.BaseSqoopTool;
import com.cloudera.sqoop.tool.CodeGenTool;
import com.cloudera.sqoop.tool.CreateHiveTableTool;
import com.cloudera.sqoop.tool.ImportTool;
import com.cloudera.sqoop.tool.SqoopTool;
import org.apache.commons.cli.ParseException;

/**
 * Test HiveImport capability after an import to HDFS.
 */
public class TestHiveImport extends ImportJobTestCase {

  public static final Log LOG = LogFactory.getLog(
      TestHiveImport.class.getName());

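  /**
   * Enables HiveImport test mode so that the generated Hive script is
   * compared against an expected script instead of being run by a real
   * Hive installation.
   */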
  public void setUp() {
    super.setUp();
    HiveImport.setTestMode(true);
  }

  public void tearDown() {
    super.tearDown();
    HiveImport.setTestMode(false);
  }

  /**
   * Sets the expected number of columns in the table being manipulated
   * by the test. Under the hood, this sets the expected column names
   * to DATA_COLi for 0 <= i < numCols.
   * @param numCols the number of columns to be created.
   */
  private void setNumCols(int numCols) {
    String [] cols = new String[numCols];
    for (int i = 0; i < numCols; i++) {
      cols[i] = "DATA_COL" + i;
    }

    setColNames(cols);
  }

  /**
   * Create the argv to pass to Sqoop.
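   * @param includeHadoopFlags whether to add the common Hadoop test flags.
   * @param moreArgs additional arguments to include before the standard ones,
   *     or null for none.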
   * @return the argv as an array of strings.
   */
  protected String [] getArgv(boolean includeHadoopFlags, String [] moreArgs) {
    ArrayList<String> args = new ArrayList<String>();

    if (includeHadoopFlags) {
      CommonArgs.addHadoopFlags(args);
    }

    if (null != moreArgs) {
      for (String arg : moreArgs) {
        args.add(arg);
      }
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--warehouse-dir");
    args.add(getWarehouseDir());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");
    String [] colNames = getColNames();
    if (null != colNames) {
      args.add("--split-by");
      args.add(colNames[0]);
    } else {
      fail("Could not determine column names.");
    }

    args.add("--num-mappers");
    args.add("1");

    for (String a : args) {
      LOG.debug("ARG : " + a);
    }

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a create-table only job for Hive imports.
   */
  protected String [] getCreateTableArgv(boolean includeHadoopFlags,
      String [] moreArgs) {

    ArrayList<String> args = new ArrayList<String>();

    if (null != moreArgs) {
      for (String arg : moreArgs) {
        args.add(arg);
      }
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a code-gen only job for Hive imports.
   */
  protected String [] getCodeGenArgs() {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a ddl-executing-only job for Hive imports.
   */
  protected String [] getCreateHiveTableArgs(String [] extraArgs) {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());

    if (null != extraArgs) {
      for (String arg : extraArgs) {
        args.add(arg);
      }
    }

    return args.toArray(new String[0]);
  }

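  /**
   * Parses the given arguments with the supplied tool, failing the test if
   * they do not form a valid set of options.
   */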
  private SqoopOptions getSqoopOptions(String [] args, SqoopTool tool) {
    SqoopOptions opts = null;
    try {
      opts = tool.parseArguments(args, null, null, true);
    } catch (Exception e) {
      fail("Invalid options: " + e.toString());
    }

    return opts;
  }

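  /**
   * Creates and populates the test table, points the mock Hive shell at the
   * expected verification script, and then runs the import with the given
   * tool.
   */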
  private void runImportTest(String tableName, String [] types,
      String [] values, String verificationScript, String [] args,
      SqoopTool tool) throws IOException {

    // create a table and populate it with a row...
    createTableWithColTypes(types, values);

    // set up our mock hive shell to compare our generated script
    // against the correct expected one.
    SqoopOptions options = getSqoopOptions(args, tool);
    String hiveHome = options.getHiveHome();
    assertNotNull("hive.home was not set", hiveHome);
    String testDataPath = new Path(new Path(hiveHome),
        "scripts/" + verificationScript).toString();
    System.setProperty("expected.script",
        new File(testDataPath).getAbsolutePath());

    // verify that we can import it correctly into hive.
    runImport(tool, args);
  }

  /** Test that we can generate a file containing the DDL and not import. */
  @Test
  public void testGenerateOnly() throws IOException {
    final String TABLE_NAME = "GenerateOnly";
    setCurTableName(TABLE_NAME);
    setNumCols(1);

    // Figure out where our target generated .q file is going to be.
    SqoopOptions options = getSqoopOptions(getArgv(false, null),
        new ImportTool());
    Path ddlFile = new Path(new Path(options.getCodeOutputDir()),
        TABLE_NAME + ".q");
    FileSystem fs = FileSystem.getLocal(new Configuration());

    // If it's already there, remove it before running the test to ensure
    // that it's the current test that generated the file.
    if (fs.exists(ddlFile)) {
      if (!fs.delete(ddlFile, false)) {
        LOG.warn("Could not delete previous ddl file: " + ddlFile);
      }
    }

    // Run a basic import, but specify that we're just generating definitions.
    String [] types = { "INTEGER" };
    String [] vals = { "42" };
    runImportTest(TABLE_NAME, types, vals, null, getCodeGenArgs(),
        new CodeGenTool());

    // Test that the generated definition file exists.
    assertTrue("Couldn't find expected ddl file", fs.exists(ddlFile));

    Path hiveImportPath = new Path(new Path(options.getWarehouseDir()),
        TABLE_NAME);
    assertFalse("Import actually happened!", fs.exists(hiveImportPath));
  }

  /** Test that strings and ints are handled in the normal fashion. */
  @Test
  public void testNormalHiveImport() throws IOException {
    final String TABLE_NAME = "NORMAL_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals, "normalImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that table is created in hive with no data import. */
  @Test
  public void testCreateOnlyHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_ONLY_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals,
        "createOnlyImport.q", getCreateHiveTableArgs(null),
        new CreateHiveTableTool());
  }

  /**
   * Test that table is created in hive and replaces the existing table if
   * any.
   */
  @Test
  public void testCreateOverwriteHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_OVERWRITE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {"--hive-overwrite", "--create-hive-table"};
    runImportTest(TABLE_NAME, types, vals,
        "createOverwriteImport.q", getCreateHiveTableArgs(extraArgs),
        new CreateHiveTableTool());
  }

  /** Test that dates are coerced properly to strings. */
  @Test
  public void testDate() throws IOException {
    final String TABLE_NAME = "DATE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "VARCHAR(32)", "DATE" };
    String [] vals = { "'test'", "'2009-05-12'" };
    runImportTest(TABLE_NAME, types, vals, "dateImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that NUMERICs are coerced to doubles. */
  @Test
  public void testNumeric() throws IOException {
    final String TABLE_NAME = "NUMERIC_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    runImportTest(TABLE_NAME, types, vals, "numericImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** If bin/hive returns an error exit status, we should get an IOException. */
  @Test
  public void testHiveExitFails() {
    // The expected script differs from the one this test would generate,
    // so we expect an IOException.
    final String TABLE_NAME = "FAILING_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    try {
      runImportTest(TABLE_NAME, types, vals, "failingImport.q",
          getArgv(false, null), new ImportTool());
      // If we get here, then the run succeeded -- which is incorrect.
      fail("FAILING_HIVE_IMPORT test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }
  }

  /** Test that we can set delimiters how we want them. */
  @Test
  public void testCustomDelimiters() throws IOException {
    final String TABLE_NAME = "CUSTOM_DELIM_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {
      "--fields-terminated-by", ",",
      "--lines-terminated-by", "|",
    };
    runImportTest(TABLE_NAME, types, vals, "customDelimImport.q",
        getArgv(false, extraArgs), new ImportTool());
  }

  /**
   * Test hive import with a row that contains newline and field-delimiter
   * characters, using the option that drops them.
   */
  @Test
  public void testFieldWithHiveDelims() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_HIVE_IMPORT";

    LOG.info("Doing import of single row into FIELD_WITH_NL_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
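    // '\01' (Ctrl-A) is Hive's default field delimiter; the drop-delims
    // option used below should strip it, and the newlines, from the values.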
    String[] vals = { "'test with \n new lines \n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG };

    runImportTest(TABLE_NAME, types, vals, "fieldWithNewlineImport.q",
        getArgv(false, moreArgs), new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_HIVE_IMPORT table");

    // Ideally, we would invoke Hive itself to verify that records containing
    // record and field delimiters have had those characters handled and that
    // we end up with the proper number of Hive records. Unfortunately, that
    // is a non-trivial task better dealt with at the integration-test level.
    //
    // Instead, this assumes the path of the generated table and just
    // validates the map job output.

    // Get and read the raw output file
    String whDir = getWarehouseDir();
    File p = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    File f = new File(p.toString());
    FileReader fr = new FileReader(f);
    BufferedReader br = new BufferedReader(fr);
    try {
      // verify the output
      assertEquals(br.readLine(), "test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ");
      assertEquals(br.readLine(), null); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test hive import with a row that contains newline and field-delimiter
   * characters, using the option that replaces them.
   */
  @Test
  public void testFieldWithHiveDelimsReplacement() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT";

    LOG.info("Doing import of single row into "
        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String[] vals = { "'test with\nnew lines\n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " "};

    runImportTest(TABLE_NAME, types, vals,
        "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs),
        new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");

    // Ideally, we would invoke Hive itself to verify that records containing
    // record and field delimiters have had those characters replaced and that
    // we end up with the proper number of Hive records. Unfortunately, that
    // is a non-trivial task better dealt with at the integration-test level.
    //
    // Instead, this assumes the path of the generated table and just
    // validates the map job output.

    // Get and read the raw output file
    String whDir = getWarehouseDir();
    File p = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    File f = new File(p.toString());
    FileReader fr = new FileReader(f);
    BufferedReader br = new BufferedReader(fr);
    try {
      // verify the output
      assertEquals(br.readLine(), "test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ");
      assertEquals(br.readLine(), null); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test hive drop and replace option validation.
   */
  @Test
  public void testHiveDropAndReplaceOptionValidation() throws ParseException {
    LOG.info("Testing conflicting Hive delimiter drop/replace options");

    setNumCols(3);
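    // The delimiter drop and replacement options are mutually exclusive;
    // supplying both should be rejected during option validation.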
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " ",
        "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG, };

    ImportTool tool = new ImportTool();
    try {
      tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null,
          null, true));
      fail("Expected InvalidOptionsException");
    } catch (InvalidOptionsException ex) {
      /* success */
    }
  }

  /**
   * Test hive import into a partitioned Hive table.
   */
  @Test
  public void testImportHiveWithPartitions() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "PARTITION_HIVE_IMPORT";

    LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
    String[] vals = { "'whoop'", "42", "'I am a row in a partition'", };
    String[] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds",
        "--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", };

    runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
        getArgv(false, moreArgs), new ImportTool());
  }

  /**
   * If the partition key is set to one of the imported columns, we should
   * get an IOException.
   */
  @Test
  public void testImportWithBadPartitionKey() {
    final String TABLE_NAME = "FAILING_PARTITION_HIVE_IMPORT";

    LOG.info("Doing import of single row into " + TABLE_NAME + " table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
    String[] vals = { "'key'", "42", "'I am a row in a partition'", };

    String partitionKey = getColNames()[0];

    // Specify 1st column as partition key and import every column of the
    // table by default (i.e. no --columns option).
    String[] moreArgs1 = {
      "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG,
      partitionKey,
    };

    // Specify 1st column as both the partition key and an imported column.
    String[] moreArgs2 = {
      "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG,
      partitionKey,
      "--" + BaseSqoopTool.COLUMNS_ARG,
      partitionKey,
    };

    // Test hive-import with the 1st args.
    try {
      runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
          getArgv(false, moreArgs1), new ImportTool());
      fail(TABLE_NAME + " test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }

    // Test hive-import with the 2nd args.
    try {
      runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
          getArgv(false, moreArgs2), new ImportTool());
      fail(TABLE_NAME + " test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }

    // Test create-hive-table with the 1st args.
    try {
      runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
          getCreateTableArgv(false, moreArgs1), new CreateHiveTableTool());
      fail(TABLE_NAME + " test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }
  }
}