Skip to content

Commit 7c11abd

Browse files
WIP: Bulk V2
1 parent 6421fee commit 7c11abd

File tree

14 files changed

+884
-66
lines changed

14 files changed

+884
-66
lines changed

contrib/datawave-quickstart/bin/services/datawave/bootstrap-ingest.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ DW_DATAWAVE_INGEST_HOME="${DW_CLOUD_HOME}/${DW_DATAWAVE_INGEST_SYMLINK}"
99
# ingest reducers. Set to 1 for standalone instance, but typically set to the first prime number that is less than the
1010
# number of available Accumulo tablet servers...
1111

12-
DW_DATAWAVE_INGEST_NUM_SHARDS=${DW_DATAWAVE_INGEST_NUM_SHARDS:-1}
12+
DW_DATAWAVE_INGEST_NUM_SHARDS=${DW_DATAWAVE_INGEST_NUM_SHARDS:-10}
1313

1414
# Ingest job logs, etc
1515

@@ -39,7 +39,7 @@ DW_DATAWAVE_INGEST_FLAGFILE_DIR="${DW_DATAWAVE_DATA_DIR}/flags"
3939

4040
# Comma-delimited list of configs for the FlagMaker process(es)
4141

42-
DW_DATAWAVE_INGEST_FLAGMAKER_CONFIGS=${DW_DATAWAVE_INGEST_FLAGMAKER_CONFIGS:-"${DW_DATAWAVE_INGEST_CONFIG_HOME}/flag-maker-live.xml"}
42+
DW_DATAWAVE_INGEST_FLAGMAKER_CONFIGS=${DW_DATAWAVE_INGEST_FLAGMAKER_CONFIGS:-"${DW_DATAWAVE_INGEST_CONFIG_HOME}/flag-maker-live.xml,${DW_DATAWAVE_INGEST_CONFIG_HOME}/flag-maker-bulk.xml"}
4343

4444
# Dir for ingest-related 'pid' files
4545

@@ -72,7 +72,7 @@ DW_DATAWAVE_INGEST_LIVE_DATA_TYPES=${DW_DATAWAVE_INGEST_LIVE_DATA_TYPES:-"wikipe
7272

7373
# Comma-delimited data type identifiers to be ingested via "bulk" ingest, ie via bulk import of RFiles into Accumulo tables
7474

75-
DW_DATAWAVE_INGEST_BULK_DATA_TYPES=${DW_DATAWAVE_INGEST_BULK_DATA_TYPES:-"shardStats"}
75+
DW_DATAWAVE_INGEST_BULK_DATA_TYPES=${DW_DATAWAVE_INGEST_BULK_DATA_TYPES:-"shardStats,wikipedia,mycsv,myjson"}
7676

7777
DW_DATAWAVE_MAPRED_INGEST_OPTS=${DW_DATAWAVE_MAPRED_INGEST_OPTS:-"-useInlineCombiner -ingestMetricsDisabled"}
7878

contrib/datawave-quickstart/bin/services/datawave/install-ingest.sh

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,14 @@ tar xf "${DW_DATAWAVE_SERVICE_DIR}/${DW_DATAWAVE_INGEST_DIST}" -C "${TARBALL_BAS
3030

3131
info "DataWave Ingest tarball extracted and symlinked"
3232

33+
source "${THIS_DIR}/fix-hadoop-classpath.sh"
34+
3335
if ! hadoopIsRunning ; then
3436
info "Starting Hadoop, so that we can initialize Accumulo"
3537
hadoopStart
3638
fi
3739

38-
# Create any Hadoop directories related to Datawave Ingest
40+
# Create any Hadoop directories needed for live ingest input
3941
if [[ -n "${DW_DATAWAVE_INGEST_LIVE_DATA_TYPES}" ]] ; then
4042

4143
OLD_IFS="${IFS}"
@@ -44,10 +46,25 @@ if [[ -n "${DW_DATAWAVE_INGEST_LIVE_DATA_TYPES}" ]] ; then
4446
IFS="${OLD_IFS}"
4547

4648
for dir in "${HDFS_RAW_INPUT_DIRS[@]}" ; do
49+
# Dirs created here should be configured in your live flag maker config (e.g., in config/flag-maker-live.xml)
4750
hdfs dfs -mkdir -p "${DW_DATAWAVE_INGEST_HDFS_BASEDIR}/${dir}" || fatal "Failed to create HDFS directory: ${dir}"
4851
done
4952
fi
5053

54+
# Create any Hadoop directories needed for bulk ingest input
55+
if [[ -n "${DW_DATAWAVE_INGEST_BULK_DATA_TYPES}" ]] ; then
56+
57+
OLD_IFS="${IFS}"
58+
IFS=","
59+
HDFS_RAW_INPUT_DIRS=( ${DW_DATAWAVE_INGEST_BULK_DATA_TYPES} )
60+
IFS="${OLD_IFS}"
61+
62+
for dir in "${HDFS_RAW_INPUT_DIRS[@]}" ; do
63+
# Dirs created here should be configured in your bulk flag maker config (e.g., in config/flag-maker-bulk.xml)
64+
hdfs dfs -mkdir -p "${DW_DATAWAVE_INGEST_HDFS_BASEDIR}/${dir}-bulk" || fatal "Failed to create HDFS directory: ${dir}-bulk"
65+
done
66+
fi
67+
5168
#----------------------------------------------------------
5269
# Configure/update Accumulo classpath, set auths, etc
5370
#----------------------------------------------------------

properties/dev.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ LIVE_CHILD_MAP_MAX_MEMORY_MB=1024
4141
BULK_CHILD_REDUCE_MAX_MEMORY_MB=2048
4242
LIVE_CHILD_REDUCE_MAX_MEMORY_MB=1024
4343

44-
BULK_INGEST_DATA_TYPES=shardStats
44+
BULK_INGEST_DATA_TYPES=shardStats,wikipedia,mycsv,myjson
4545
LIVE_INGEST_DATA_TYPES=wikipedia,mycsv,myjson
4646

4747
# Clear out these values if you do not want standard shard ingest.

warehouse/ingest-configuration/src/main/resources/config/ingest-config.xml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,24 @@
9191
<name>partitioner.default.delegate</name>
9292
<value>datawave.ingest.mapreduce.partition.MultiTableRRRangePartitioner</value>
9393
</property>
94+
95+
<property>
96+
<name>datawave.ingest.splits.cache.dir</name>
97+
<value>${WAREHOUSE_HDFS_NAME_NODE}/data/splitsCache</value>
98+
</property>
99+
100+
<property>
101+
<name>accumulo.config.cache.path</name>
102+
<value>${WAREHOUSE_HDFS_NAME_NODE}/data/accumuloConfigCache/accConfCache.txt</value>
103+
</property>
104+
105+
<property>
106+
<name>ingest.bulk.import.mode</name>
107+
<value>V2_LOAD_PLANNING</value>
108+
<description>
109+
Must be one of [V1, V2_LOCAL_MAPPING, V2_LOAD_PLANNING]
110+
(See BulkIngestMapFileLoader.ImportMode)
111+
</description>
112+
</property>
113+
94114
</configuration>
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
3+
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
4+
5+
<appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender">
6+
<param name="Target" value="System.out"/>
7+
<layout class="org.apache.log4j.PatternLayout">
8+
<param name="ConversionPattern" value="%d{ISO8601} %p [%c{1.}] [%t-%tid] %m%n"/>
9+
</layout>
10+
</appender>
11+
12+
<logger name="org.apache.hadoop">
13+
<level value="info"/>
14+
</logger>
15+
16+
<root>
17+
<priority value="debug"/>
18+
<appender-ref ref="CONSOLE"/>
19+
</root>
20+
</log4j:configuration>

warehouse/ingest-core/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
<groupId>com.clearspring.analytics</groupId>
1515
<artifactId>stream</artifactId>
1616
</dependency>
17+
<dependency>
18+
<groupId>com.google.code.gson</groupId>
19+
<artifactId>gson</artifactId>
20+
<version>2.9.0</version>
21+
</dependency>
1722
<dependency>
1823
<groupId>com.sun.xml.bind</groupId>
1924
<artifactId>jaxb-impl</artifactId>

warehouse/ingest-core/src/main/java/datawave/ingest/config/BaseHdfsFileCacheUtil.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ public Path createTempFile(FileSystem fs) throws IOException {
130130
do {
131131
Path parentDirectory = this.cacheFilePath.getParent();
132132
String fileName = this.cacheFilePath.getName() + "." + count;
133-
log.info("Attempting to create " + fileName + "under " + parentDirectory);
133+
log.info("Attempting to create " + fileName + " under " + parentDirectory);
134134
tmpCacheFile = new Path(parentDirectory, fileName);
135135
count++;
136136
} while (!fs.createNewFile(tmpCacheFile));

0 commit comments

Comments
 (0)