SQOOP-319. Support for replacing Hive delimiters.
[sqoop.git] src/test/com/cloudera/sqoop/hive/TestHiveImport.java
/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.hive;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
import com.cloudera.sqoop.tool.BaseSqoopTool;
import com.cloudera.sqoop.tool.CodeGenTool;
import com.cloudera.sqoop.tool.CreateHiveTableTool;
import com.cloudera.sqoop.tool.ImportTool;
import com.cloudera.sqoop.tool.SqoopTool;

/**
 * Test HiveImport capability after an import to HDFS.
 */
public class TestHiveImport extends ImportJobTestCase {

  public static final Log LOG = LogFactory.getLog(
      TestHiveImport.class.getName());

  /**
   * Sets the expected number of columns in the table being manipulated
   * by the test. Under the hood, this sets the expected column names
   * to DATA_COLi for 0 <= i < numCols.
   * @param numCols the number of columns to be created.
   */
  private void setNumCols(int numCols) {
    String [] cols = new String[numCols];
    for (int i = 0; i < numCols; i++) {
      cols[i] = "DATA_COL" + i;
    }

    setColNames(cols);
  }

  /**
   * Create the argv to pass to Sqoop.
   * @return the argv as an array of strings.
   */
  protected String [] getArgv(boolean includeHadoopFlags, String [] moreArgs) {
    ArrayList<String> args = new ArrayList<String>();

    if (includeHadoopFlags) {
      CommonArgs.addHadoopFlags(args);
    }

    if (null != moreArgs) {
      for (String arg : moreArgs) {
        args.add(arg);
      }
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--warehouse-dir");
    args.add(getWarehouseDir());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");
    String [] colNames = getColNames();
    if (null != colNames) {
      args.add("--split-by");
      args.add(colNames[0]);
    } else {
      fail("Could not determine column names.");
    }

    args.add("--num-mappers");
    args.add("1");

    for (String a : args) {
      LOG.debug("ARG : " + a);
    }

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a code-gen only job for Hive imports.
   */
  protected String [] getCodeGenArgs() {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--hive-import");

    return args.toArray(new String[0]);
  }

  /**
   * @return the argv to supply to a ddl-executing-only job for Hive imports.
   */
  protected String [] getCreateHiveTableArgs(String [] extraArgs) {
    ArrayList<String> args = new ArrayList<String>();

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());

    if (null != extraArgs) {
      for (String arg : extraArgs) {
        args.add(arg);
      }
    }

    return args.toArray(new String[0]);
  }

  private SqoopOptions getSqoopOptions(String [] args, SqoopTool tool) {
    SqoopOptions opts = null;
    try {
      opts = tool.parseArguments(args, null, null, true);
    } catch (Exception e) {
      fail("Invalid options: " + e.toString());
    }

    return opts;
  }

  private void runImportTest(String tableName, String [] types,
      String [] values, String verificationScript, String [] args,
      SqoopTool tool) throws IOException {

    // Create a table and populate it with a row...
    createTableWithColTypes(types, values);

    // Set up our mock hive shell to compare the generated script
    // against the expected one.
    SqoopOptions options = getSqoopOptions(args, tool);
    String hiveHome = options.getHiveHome();
    assertNotNull("hive.home was not set", hiveHome);
    Path testDataPath = new Path(new Path(hiveHome),
        "scripts/" + verificationScript);
    System.setProperty("expected.script", testDataPath.toString());

    // Verify that we can import it correctly into hive.
    runImport(tool, args);
  }

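  // Note on runImportTest(): the *.q verification scripts named by each test
  // are expected to live under the configured hive.home at scripts/, and the
  // mock "hive" executable presumably compares the script Sqoop generates
  // against the file named by the "expected.script" property, exiting
  // non-zero on a mismatch (which is what testHiveExitFails relies on).
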
  /** Test that we can generate a file containing the DDL and not import. */
  @Test
  public void testGenerateOnly() throws IOException {
    final String TABLE_NAME = "GenerateOnly";
    setCurTableName(TABLE_NAME);
    setNumCols(1);

    // Figure out where our target generated .q file is going to be.
    SqoopOptions options = getSqoopOptions(getArgv(false, null),
        new ImportTool());
    Path ddlFile = new Path(new Path(options.getCodeOutputDir()),
        TABLE_NAME + ".q");
    FileSystem fs = FileSystem.getLocal(new Configuration());

    // If it's already there, remove it before running the test to ensure
    // that it's the current test that generated the file.
    if (fs.exists(ddlFile)) {
      if (!fs.delete(ddlFile, false)) {
        LOG.warn("Could not delete previous ddl file: " + ddlFile);
      }
    }

    // Run a basic import, but specify that we're just generating definitions.
    String [] types = { "INTEGER" };
    String [] vals = { "42" };
    runImportTest(TABLE_NAME, types, vals, null, getCodeGenArgs(),
        new CodeGenTool());

    // Test that the generated definition file exists.
    assertTrue("Couldn't find expected ddl file", fs.exists(ddlFile));

    Path hiveImportPath = new Path(new Path(options.getWarehouseDir()),
        TABLE_NAME);
    assertFalse("Import actually happened!", fs.exists(hiveImportPath));
  }

  /** Test that strings and ints are handled in the normal fashion. */
  @Test
  public void testNormalHiveImport() throws IOException {
    final String TABLE_NAME = "NORMAL_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals, "normalImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that the table is created in Hive with no data import. */
  @Test
  public void testCreateOnlyHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_ONLY_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    runImportTest(TABLE_NAME, types, vals,
        "createOnlyImport.q", getCreateHiveTableArgs(null),
        new CreateHiveTableTool());
  }

  /**
   * Test that the table is created in Hive, replacing any existing table of
   * the same name.
   */
  @Test
  public void testCreateOverwriteHiveImport() throws IOException {
    final String TABLE_NAME = "CREATE_OVERWRITE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {"--hive-overwrite", "--create-hive-table"};
    runImportTest(TABLE_NAME, types, vals,
        "createOverwriteImport.q", getCreateHiveTableArgs(extraArgs),
        new CreateHiveTableTool());
  }

  /** Test that dates are coerced properly to strings. */
  @Test
  public void testDate() throws IOException {
    final String TABLE_NAME = "DATE_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "VARCHAR(32)", "DATE" };
    String [] vals = { "'test'", "'2009-05-12'" };
    runImportTest(TABLE_NAME, types, vals, "dateImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** Test that NUMERICs are coerced to doubles. */
  @Test
  public void testNumeric() throws IOException {
    final String TABLE_NAME = "NUMERIC_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    runImportTest(TABLE_NAME, types, vals, "numericImport.q",
        getArgv(false, null), new ImportTool());
  }

  /** If bin/hive returns an error exit status, we should get an IOException. */
  @Test
  public void testHiveExitFails() {
    // The expected script is different from the one this run will generate,
    // so we expect an IOException.
    final String TABLE_NAME = "FAILING_HIVE_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(2);
    String [] types = { "NUMERIC", "CHAR(64)" };
    String [] vals = { "3.14159", "'foo'" };
    try {
      runImportTest(TABLE_NAME, types, vals, "failingImport.q",
          getArgv(false, null), new ImportTool());
      // If we get here, then the run succeeded -- which is incorrect.
      fail("FAILING_HIVE_IMPORT test should have thrown IOException");
    } catch (IOException ioe) {
      // expected; ok.
    }
  }

  /** Test that we can set delimiters how we want them. */
  @Test
  public void testCustomDelimiters() throws IOException {
    final String TABLE_NAME = "CUSTOM_DELIM_IMPORT";
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test'", "42", "'somestring'" };
    String [] extraArgs = {
      "--fields-terminated-by", ",",
      "--lines-terminated-by", "|",
    };
    runImportTest(TABLE_NAME, types, vals, "customDelimImport.q",
        getArgv(false, extraArgs), new ImportTool());
  }

  /**
   * Test Hive import of a row containing embedded newline and field
   * delimiters, with the delimiter-dropping option enabled.
   */
  @Test
  public void testFieldWithHiveDelims() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_HIVE_IMPORT";

    LOG.info("Doing import of single row into FIELD_WITH_NL_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test with \n new lines \n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
    String [] moreArgs = { "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG };

    runImportTest(TABLE_NAME, types, vals, "fieldWithNewlineImport.q",
        getArgv(false, moreArgs), new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_HIVE_IMPORT table");

    // Ideally, we would invoke Hive to verify that records containing record
    // and field delimiters have had those characters removed and that we end
    // up with the proper number of Hive records. Unfortunately, that is a
    // non-trivial task better handled at the integration-test level.
    //
    // Instead, this test assumes the path of the generated table and simply
    // validates the map job output.

    // Get and read the raw output file.
    String whDir = getWarehouseDir();
    File f = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    BufferedReader br = new BufferedReader(new FileReader(f));
    try {
      // Verify the output: '\01' (Ctrl-A) is Hive's default field delimiter,
      // and the embedded newlines and '\01' characters should have been
      // dropped from the field values.
      assertEquals("test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ", br.readLine());
      assertEquals(null, br.readLine()); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test Hive import of a row containing embedded newline and field
   * delimiters, with the delimiter-replacement option enabled.
   */
  @Test
  public void testFieldWithHiveDelimsReplacement() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT";

    LOG.info("Doing import of single row into "
        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
    String [] vals = { "'test with\nnew lines\n'", "42",
        "'oh no " + '\01' + " field delims " + '\01' + "'", };
    String [] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG,
        " " };

    runImportTest(TABLE_NAME, types, vals,
        "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs),
        new ImportTool());

    LOG.info("Validating data in single row is present in: "
        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");

    // Ideally, we would invoke Hive to verify that records containing record
    // and field delimiters have had those characters replaced and that we end
    // up with the proper number of Hive records. Unfortunately, that is a
    // non-trivial task better handled at the integration-test level.
    //
    // Instead, this test assumes the path of the generated table and simply
    // validates the map job output.

    // Get and read the raw output file.
    String whDir = getWarehouseDir();
    File f = new File(new File(whDir, TABLE_NAME), "part-m-00000");
    BufferedReader br = new BufferedReader(new FileReader(f));
    try {
      // Verify the output: the embedded newlines and '\01' characters should
      // have been replaced by the supplied replacement string (a space).
      assertEquals("test with new lines " + '\01' + "42"
          + '\01' + "oh no field delims ", br.readLine());
      assertEquals(null, br.readLine()); // should only be one line
    } catch (IOException ioe) {
      fail("Unable to read files generated from hive");
    } finally {
      br.close();
    }
  }

  /**
   * Test that specifying both the Hive delimiter drop and replacement
   * options fails validation.
   */
  @Test
  public void testHiveDropAndReplaceOptionValidation() throws ParseException {
    LOG.info("Testing conflicting Hive delimiter drop/replace options");

    setNumCols(3);
    String [] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG,
        " ", "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG, };

    ImportTool tool = new ImportTool();
    try {
      tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null,
          null, true));
      fail("Expected InvalidOptionsException");
    } catch (InvalidOptionsException ex) {
      /* success */
    }
  }

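  // The drop and replacement options exercised above are mutually exclusive:
  // a field value can either have embedded Hive delimiters stripped or have
  // them replaced, but not both, so ImportTool.validateOptions() is expected
  // to reject the combination with an InvalidOptionsException.
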
  /**
   * Test Hive import into a partitioned table.
   */
  @Test
  public void testImportHiveWithPartitions() throws IOException,
      InterruptedException {
    final String TABLE_NAME = "PARTITION_HIVE_IMPORT";

    LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table");
    setCurTableName(TABLE_NAME);
    setNumCols(3);
    String [] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
    String [] vals = { "'whoop'", "42", "'I am a row in a partition'", };
    String [] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds",
        "--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", };

    runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
        getArgv(false, moreArgs), new ImportTool());
  }

}