1. Obtain apache-maven-3.6.3-bin.tar.gz from the Apache website: https://maven.apache.org/download.cgi
2. Extract the tar into the /app directory:
[root@r01edge app] gunzip apache-maven-3.6.3-bin.tar.gz
[root@r01edge app] tar xvf apache-maven-3.6.3-bin.tar
[root@r01edge app] mv apache-maven-3.6.3 maven
[root@r01edge app] cd maven/bin
Create a pom template for scala spark development
[root@r01edge bin] vi pom_template.xml
<!-- POM template for Scala/Spark projects. mkpr.sh copies this file into a new
     project and replaces every literal PROJECT with the project name. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.cba.spark</groupId>
<packaging>jar</packaging>
<artifactId>PROJECT</artifactId>
<version>1.0.0-SNAPSHOT</version>
<!-- Targets for 'mvn deploy': releases vs. snapshots on the local Nexus.
     Both use server id "nexus"; credentials come from ~/.m2/settings.xml. -->
<distributionManagement>
<repository>
<id>nexus</id>
<name>maven-releases</name>
<url>http://192.168.2.22:8081/repository/maven-releases/</url>
</repository>
<snapshotRepository>
<id>nexus</id>
<name>maven-snapshots</name>
<url>http://192.168.2.22:8081/repository/maven-snapshots/</url>
</snapshotRepository>
</distributionManagement>
<properties>
<encoding>UTF-8</encoding>
<!-- Must stay in sync with the _2.11 suffix of the artifacts below. -->
<scala.version>2.11.11</scala.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>3.0.4</version>
<!-- NOTE(review): scalatest is normally <scope>test</scope>; 'compile' puts
     it on the runtime classpath — confirm this is intended. -->
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.0</version>
</dependency>
<!-- In-house Spark helper library (cbaSpark.jar, symlinked by mkpr.sh). -->
<dependency>
<groupId>com.cba.spark</groupId>
<artifactId>cbaSpark</artifactId>
<version>1.0.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiles src/main/scala and src/test/scala.
     NOTE(review): org.scala-tools:maven-scala-plugin is long unmaintained;
     consider migrating to net.alchim31.maven:scala-maven-plugin. -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Runs ScalaTest suites during the 'test' phase. -->
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Records the main class in the jar manifest so run.sh's spark-submit
     does not need an explicit --class argument. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.1.2</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>com.cba.spark.PROJECT</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>
[root@r01edge bin]# vi run.sh
#!/bin/sh
# Launcher template for the PROJECT Spark job; mkpr.sh copies this file and
# replaces 'your_project_name_here' with the real project name.
# Job stdout goes to <project>_result.txt, diagnostics to <project>_run.log.
VER="1.0.0-SNAPSHOT"
PROJECT="your_project_name_here"
JAR="target/${PROJECT}-${VER}.jar"
# Job arguments: settings file path (HDFS) and the log level.
param1="/user/dataexplorer1/settings.ini"
param2="DEBUG"
# No --class flag: the main class is read from the jar manifest,
# which maven-jar-plugin populates from the pom template.
nohup spark-submit --driver-memory 2G --jars cbaSpark.jar \
  --conf spark.dynamicAllocation.maxExecutors=5 \
  --num-executors 5 --executor-cores 6 --executor-memory 15G \
  --queue root.users.dataexplorer1 \
  "${JAR}" "${param1}" "${param2}" \
  1>"${PROJECT}_result.txt" 2>"${PROJECT}_run.log" &
[root@r01edge bin]# vi template.scala
//File : PROJECT.scala
//Author : Boris Alexandrov <boris.alexandrov@custom-built-apps.com>
//Date : FILE_DATE
//Project : PROJECT implementation
//Version : v1.0
//Revision :initial version
////////////////////////////////////////////
package com.cba.spark
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql._
import java.util.Calendar
import scala.concurrent.forkjoin._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, Future}
object PROJECT
{
  // Template entry point generated by mkpr.sh; every literal PROJECT is
  // replaced with the real project name at generation time.
  // args(0) - path to the settings .ini file, args(1) - debug flag (e.g. "DEBUG")
  def main(args: Array[String])
  {
    var cbaspark: cbaSpark = null
    val configFile = args(0)
    val debugOn = args(1)
    try
    {
      // ';'-separated session properties consumed by the cbaSpark wrapper.
      var sparkProperties = "hive.metastore.uris=thrift://datanode1.custom-built-apps.com:9083;"
      sparkProperties += "spark.driver.maxResultSize=0;spark.sql.session.timeZone=Canada/Eastern;"
      sparkProperties += "hive.exec.dynamic.partition=true;hive.exec.dynamic.partition.mode=nonstrict;"
      // Build the session exactly once (the original template constructed a
      // second, redundant cbaSpark instance right after reading the variables).
      cbaspark = new cbaSpark("PROJECT", configFile, sparkProperties)
      cbaspark.printStart
      cbaspark.printSettings
      val source_db = cbaspark.variables("source_db")
      val stage_db = cbaspark.variables("stage_db")
      val target_db = cbaspark.variables("target_db")
      val warehouseLocation = cbaspark.variables("warehouseLocation")
      val tableName = s"${stage_db}.PROJECT"
      val dataPath = s"${warehouseLocation}/${tableName}"
      val location = s""" location '${dataPath}'"""
      // drop the table so the job can re-create it from scratch
      cbaspark.dropTable(tableName, dataPath)
      /* REMOVE THE COMMENTS
      val strPROJECT_tmp=s"""select * from ${stage_db}.sometable
      """
      val dfPROJECT_tmp = cbaspark.execHiveQuery(strPROJECT_tmp)
      cbaspark.createView(dfPROJECT_tmp,"PROJECT_tmp")
      val df=cbaspark.execHiveQuery("select * from PROJECT_tmp")
      df.show(false)
      //create an external table (location -> marker for external)
      val strPROJECT=s"""create table ${tableName}
      stored as parquet
      ${location}
      tblproperties('parquet.compress'='SNAPPY')
      as
      """
      cbaspark.execHQL(strPROJECT)
      REMOVE THE COMMENTS
      */
      //////////////////////// TESTING CODE , SHOULD BE REMOVED FROM THE FINAL ///////////////////
      /////////////// change the number of columns(0 based), expects all columns being strings ///
      // The original referenced an undefined variable `cbadb1`; stage_db is used
      // here so the template compiles — point at whichever db holds your test table.
      val strTest = s"""select * from ${stage_db}.test"""
      val dfTest = cbaspark.execHiveQuery(strTest)
      var i = 0
      val columnNames = dfTest.columns
      dfTest.take(10).foreach
      {
        row =>
          i = i + 1
          println(s"""_____ record ${i} __________""")
          for (n <- 0 to 4) //5 columns 10 records
          {
            println(columnNames(n) + ": " + row.getString(n))
          }
      }
      println(s"""Total number of records generated is ${dfTest.count}.""")
      /* //time consuming, enable when needed
      val numDistinctValues=dfTest.agg(countDistinct("id")).collect()(0)(0)
      println(s"""Total number of distinct records generated is ${numDistinctValues}.""")
      */
      /////////////////////////////END OF TESTING CODE /////////////////////////////////
    } //end of try
    catch
    {
      case e: Exception => println("[ERROR] " + e)
    }
    finally
    {
      // Guard: if the cbaSpark constructor threw, cbaspark is still null and
      // the original finally block would itself throw an NPE, masking the error.
      if (cbaspark != null)
      {
        cbaspark.stop
        cbaspark.printEnd
      }
    }
  } //end of main
} //end of Class
Create a script generating maven directory structure for scala development:
[root@r01edge bin]# vi mkpr.sh
#!/bin/sh
#################################################################
# File    : mkpr.sh
# Author  : Boris Alexandrov
# Date    : December 2018
# Purpose : generate the Maven/Scala directory structure for a
#           new Spark project and instantiate the templates.
# Usage   : mkpr.sh <ProjectName>
# Note    : the shebang must be the FIRST line of the file; the
#           original had the header comment above it, which made
#           the kernel ignore the interpreter line.
#################################################################
if [ $# -eq 0 ]
then
  echo "***********************************************"
  echo "No arguments supplied, Project name is expected"
  echo "***********************************************"
  exit 2
fi
MAVEN_HOME=/app/maven
PROJECT="$1"
# Generate the standard quickstart skeleton; abort on failure so the
# destructive steps below never run in the wrong directory.
mvn archetype:generate -DgroupId=com.cba.spark -DartifactId="${PROJECT}" \
  -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false || exit 1
cd "${PROJECT}" || exit 1
# Instantiate the POM template with the real project name.
cp "${MAVEN_HOME}/bin/pom_template.xml" ./pom.xml
sed -i "s/PROJECT/${PROJECT}/g" pom.xml
# Shared in-house library jar (symlink, not a copy).
ln -s ~/projects/lib/cbaSpark.jar cbaSpark.jar
# Instantiate the spark-submit launcher.
cp "${MAVEN_HOME}/bin/run.sh" ./run.sh
sed -i "s/your_project_name_here/${PROJECT}/g" run.sh
chmod 755 ./run.sh
# Convert the generated Java layout into a Scala one.
cd src || exit 1
rm -rf test
cd main || exit 1
mv java scala
cd scala/com/cba/spark || exit 1
rm -f -- *.java
# Instantiate the Scala source template.
cp "${MAVEN_HOME}/bin/template.scala" "${PROJECT}.scala"
sed -i "s/PROJECT/${PROJECT}/g" "${PROJECT}.scala"
# Default 'date' output contains no '/', so it is safe inside s/…/…/.
file_date=$(date)
sed -i "s/FILE_DATE/${file_date}/g" "${PROJECT}.scala"
echo "${PROJECT} directory structure has been created"
[root@r01edge bin]# chmod 755 mkpr.sh
[root@r01edge bin]# cd ../
[root@r01edge maven]# chmod 755 bin
Create a Maven settings file for user dataexplorer1:
[root@r01edge bin]# su - dataexplorer1
[dataexplorer1@r01edge ~]$ mkdir .m2
[dataexplorer1@r01edge ~]$ cd .m2
[dataexplorer1@r01edge .m2]$ vi settings.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven user settings for dataexplorer1 (~/.m2/settings.xml). -->
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
<!-- Credentials per server id. Passwords come from the MAVEN_REPO_PASSWORD
     environment variable (exported in the user's shell), never stored here. -->
<servers>
<server>
<id>nexus</id>
<username>dataexplorer1</username>
<password>${env.MAVEN_REPO_PASSWORD}</password>
</server>
<server>
<id>maven.oracle.com</id>
<username>boris.alexandrov@hotmail.ca</username>
<password>${env.MAVEN_REPO_PASSWORD}</password>
<!-- Oracle's repository sits behind OAM single sign-on: allow basic auth
     on any host/port and tolerate the SSO circular redirects. -->
<configuration>
<basicAuthScope>
<host>ANY</host>
<port>ANY</port>
<realm>OAM 11g</realm>
</basicAuthScope>
<httpConfiguration>
<all>
<params>
<property>
<name>http.protocol.allow-circular-redirects</name>
<value>%b,true</value>
</property>
</params>
</all>
</httpConfiguration>
</configuration>
</server>
</servers>
<!-- Route every request for Maven Central through the local Nexus proxy. -->
<mirrors>
<mirror>
<id>nexus</id>
<mirrorOf>central</mirrorOf>
<name>Mirror for Maven Central</name>
<url>http://192.168.2.22:8081/repository/maven-central/</url>
</mirror>
</mirrors>
<profiles>
<profile>
<id>cbadev</id>
<repositories>
<!-- NOTE(review): http://central is a placeholder URL — the 'nexus' mirror
     above intercepts the 'central' id, so this URL is never contacted. -->
<repository>
<id>central</id>
<url>http://central</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>maven.oracle.com</id>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<url>https://maven.oracle.com</url>
<layout>default</layout>
</repository>
<!-- In-house releases published by 'mvn deploy' (see pom_template.xml). -->
<repository>
<releases />
<snapshots />
<id>maven-releases</id>
<url>http://192.168.2.22:8081/repository/maven-releases/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>central</id>
<url>http://central</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</pluginRepository>
<pluginRepository>
<id>maven.oracle.com</id>
<url>https://maven.oracle.com</url>
</pluginRepository>
</pluginRepositories>
</profile>
</profiles>
<!-- cbadev is always active, so no -P flag is needed on mvn commands. -->
<activeProfiles>
<activeProfile>cbadev</activeProfile>
</activeProfiles>
</settings>
In your environment set the paths and password for nexus:
PATH=${PATH}:${HOME}/bin:/app/maven/bin
export PATH
export path=src/main/scala/com/cba/spark
export MAVEN_REPO_PASSWORD=ThEpAsSword
NOW creating a scala based project for spark-submit will look as follows:
[dataexplorer1@r01edge scala]$ mkpr.sh cbaTest
[dataexplorer1@r01edge scala]$ cd cbaTest
the scala source files will be located in the directory src/main/scala/com/cba/spark
[dataexplorer1@r01edge scala]$ ls -l src/main/scala/com/cba/spark
[dataexplorer1@r01edge scala]$ ls -l $path
Building a snapshot will be as easy as that:
[dataexplorer1@r01edge cbaTest]$ mvn package
Building and deploying snapshot into the nexus maven-snapshots repository:
[dataexplorer1@r01edge cbaTest]$ mvn clean deploy
Building and deploying a release into nexus maven-releases repository:
1. remove the -SNAPSHOT suffix from the version in pom.xml so that only the version number remains
2. [dataexplorer1@r01edge cbaTest]$ mvn clean deploy