1. Obtain apache-maven-3.6.3-bin.tar.gz from the Apache website: https://maven.apache.org/download.cgi
2. Extract the tar into the /app directory:
[root@r01edge app] gunzip apache-maven-3.6.3-bin.tar.gz
[root@r01edge app] tar xvf apache-maven-3.6.3-bin.tar
[root@r01edge app] mv apache-maven-3.6.3 maven
[root@r01edge app] cd maven/bin
Create a pom template for scala spark development
[root@r01edge bin] vi pom_template.xml
<!-- POM template for Scala/Spark projects. mkpr.sh copies this file into a new
     project and replaces every literal PROJECT with the project name. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.cba.spark</groupId>
<packaging>jar</packaging>
<artifactId>PROJECT</artifactId>
<version>1.0.0-SNAPSHOT</version>
<!-- Targets for 'mvn deploy': releases vs. snapshots on the local Nexus.
     Both use server id "nexus"; credentials come from ~/.m2/settings.xml. -->
<distributionManagement>
<repository>
<id>nexus</id>
<name>maven-releases</name>
<url>http://192.168.2.22:8081/repository/maven-releases/</url>
</repository>
<snapshotRepository>
<id>nexus</id>
<name>maven-snapshots</name>
<url>http://192.168.2.22:8081/repository/maven-snapshots/</url>
</snapshotRepository>
</distributionManagement>
<properties>
<encoding>UTF-8</encoding>
<!-- Must stay in sync with the _2.11 suffix of the artifacts below. -->
<scala.version>2.11.11</scala.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>3.0.4</version>
<!-- NOTE(review): scalatest is normally <scope>test</scope>; 'compile' puts
     it on the runtime classpath — confirm this is intended. -->
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.0</version>
</dependency>
<!-- In-house Spark helper library (cbaSpark.jar, symlinked by mkpr.sh). -->
<dependency>
<groupId>com.cba.spark</groupId>
<artifactId>cbaSpark</artifactId>
<version>1.0.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiles src/main/scala and src/test/scala.
     NOTE(review): org.scala-tools:maven-scala-plugin is long unmaintained;
     consider migrating to net.alchim31.maven:scala-maven-plugin. -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Runs ScalaTest suites during the 'test' phase. -->
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Records the main class in the jar manifest so run.sh's spark-submit
     does not need an explicit --class argument. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.1.2</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>com.cba.spark.PROJECT</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>
[root@r01edge bin]# vi run.sh
#!/bin/sh
# Launcher template for the PROJECT Spark job; mkpr.sh copies this file and
# replaces 'your_project_name_here' with the real project name.
# Job stdout goes to <project>_result.txt, diagnostics to <project>_run.log.
VER="1.0.0-SNAPSHOT"
PROJECT="your_project_name_here"
JAR="target/${PROJECT}-${VER}.jar"
# Job arguments: settings file path (HDFS) and the log level.
param1="/user/dataexplorer1/settings.ini"
param2="DEBUG"
# No --class flag: the main class is read from the jar manifest,
# which maven-jar-plugin populates from the pom template.
nohup spark-submit --driver-memory 2G --jars cbaSpark.jar \
  --conf spark.dynamicAllocation.maxExecutors=5 \
  --num-executors 5 --executor-cores 6 --executor-memory 15G \
  --queue root.users.dataexplorer1 \
  "${JAR}" "${param1}" "${param2}" \
  1>"${PROJECT}_result.txt" 2>"${PROJECT}_run.log" &
[root@r01edge bin]# vi template.scala
//File : PROJECT.scala
//Author : Boris Alexandrov <boris.alexandrov@custom-built-apps.com>
//Date : FILE_DATE
//Project : PROJECT implementation
//Version : v1.0
//Revision :initial version
////////////////////////////////////////////
package com.cba.spark
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql._
import java.util.Calendar
import scala.concurrent.forkjoin._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, Future}
object PROJECT
{
  // Template entry point generated by mkpr.sh; every literal PROJECT is
  // replaced with the real project name at generation time.
  // args(0) - path to the settings .ini file, args(1) - debug flag (e.g. "DEBUG")
  def main(args: Array[String])
  {
    var cbaspark: cbaSpark = null
    val configFile = args(0)
    val debugOn = args(1)
    try
    {
      // ';'-separated session properties consumed by the cbaSpark wrapper.
      var sparkProperties = "hive.metastore.uris=thrift://datanode1.custom-built-apps.com:9083;"
      sparkProperties += "spark.driver.maxResultSize=0;spark.sql.session.timeZone=Canada/Eastern;"
      sparkProperties += "hive.exec.dynamic.partition=true;hive.exec.dynamic.partition.mode=nonstrict;"
      // Build the session exactly once (the original template constructed a
      // second, redundant cbaSpark instance right after reading the variables).
      cbaspark = new cbaSpark("PROJECT", configFile, sparkProperties)
      cbaspark.printStart
      cbaspark.printSettings
      val source_db = cbaspark.variables("source_db")
      val stage_db = cbaspark.variables("stage_db")
      val target_db = cbaspark.variables("target_db")
      val warehouseLocation = cbaspark.variables("warehouseLocation")
      val tableName = s"${stage_db}.PROJECT"
      val dataPath = s"${warehouseLocation}/${tableName}"
      val location = s""" location '${dataPath}'"""
      // drop the table so the job can re-create it from scratch
      cbaspark.dropTable(tableName, dataPath)
      /* REMOVE THE COMMENTS
      val strPROJECT_tmp=s"""select * from ${stage_db}.sometable
      """
      val dfPROJECT_tmp = cbaspark.execHiveQuery(strPROJECT_tmp)
      cbaspark.createView(dfPROJECT_tmp,"PROJECT_tmp")
      val df=cbaspark.execHiveQuery("select * from PROJECT_tmp")
      df.show(false)
      //create an external table (location -> marker for external)
      val strPROJECT=s"""create table ${tableName}
      stored as parquet
      ${location}
      tblproperties('parquet.compress'='SNAPPY')
      as
      """
      cbaspark.execHQL(strPROJECT)
      REMOVE THE COMMENTS
      */
      //////////////////////// TESTING CODE , SHOULD BE REMOVED FROM THE FINAL ///////////////////
      /////////////// change the number of columns(0 based), expects all columns being strings ///
      // The original referenced an undefined variable `cbadb1`; stage_db is used
      // here so the template compiles — point at whichever db holds your test table.
      val strTest = s"""select * from ${stage_db}.test"""
      val dfTest = cbaspark.execHiveQuery(strTest)
      var i = 0
      val columnNames = dfTest.columns
      dfTest.take(10).foreach
      {
        row =>
          i = i + 1
          println(s"""_____ record ${i} __________""")
          for (n <- 0 to 4) //5 columns 10 records
          {
            println(columnNames(n) + ": " + row.getString(n))
          }
      }
      println(s"""Total number of records generated is ${dfTest.count}.""")
      /* //time consuming, enable when needed
      val numDistinctValues=dfTest.agg(countDistinct("id")).collect()(0)(0)
      println(s"""Total number of distinct records generated is ${numDistinctValues}.""")
      */
      /////////////////////////////END OF TESTING CODE /////////////////////////////////
    } //end of try
    catch
    {
      case e: Exception => println("[ERROR] " + e)
    }
    finally
    {
      // Guard: if the cbaSpark constructor threw, cbaspark is still null and
      // the original finally block would itself throw an NPE, masking the error.
      if (cbaspark != null)
      {
        cbaspark.stop
        cbaspark.printEnd
      }
    }
  } //end of main
} //end of Class
Create a script generating maven directory structure for scala development:
[root@r01edge bin]# vi mkpr.sh
#!/bin/sh
#################################################################
# File    : mkpr.sh
# Author  : Boris Alexandrov
# Date    : December 2018
# Purpose : generate the Maven/Scala directory structure for a
#           new Spark project and instantiate the templates.
# Usage   : mkpr.sh <ProjectName>
# Note    : the shebang must be the FIRST line of the file; the
#           original had the header comment above it, which made
#           the kernel ignore the interpreter line.
#################################################################
if [ $# -eq 0 ]
then
  echo "***********************************************"
  echo "No arguments supplied, Project name is expected"
  echo "***********************************************"
  exit 2
fi
MAVEN_HOME=/app/maven
PROJECT="$1"
# Generate the standard quickstart skeleton; abort on failure so the
# destructive steps below never run in the wrong directory.
mvn archetype:generate -DgroupId=com.cba.spark -DartifactId="${PROJECT}" \
  -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false || exit 1
cd "${PROJECT}" || exit 1
# Instantiate the POM template with the real project name.
cp "${MAVEN_HOME}/bin/pom_template.xml" ./pom.xml
sed -i "s/PROJECT/${PROJECT}/g" pom.xml
# Shared in-house library jar (symlink, not a copy).
ln -s ~/projects/lib/cbaSpark.jar cbaSpark.jar
# Instantiate the spark-submit launcher.
cp "${MAVEN_HOME}/bin/run.sh" ./run.sh
sed -i "s/your_project_name_here/${PROJECT}/g" run.sh
chmod 755 ./run.sh
# Convert the generated Java layout into a Scala one.
cd src || exit 1
rm -rf test
cd main || exit 1
mv java scala
cd scala/com/cba/spark || exit 1
rm -f -- *.java
# Instantiate the Scala source template.
cp "${MAVEN_HOME}/bin/template.scala" "${PROJECT}.scala"
sed -i "s/PROJECT/${PROJECT}/g" "${PROJECT}.scala"
# Default 'date' output contains no '/', so it is safe inside s/…/…/.
file_date=$(date)
sed -i "s/FILE_DATE/${file_date}/g" "${PROJECT}.scala"
echo "${PROJECT} directory structure has been created"
[root@r01edge bin]# chmod 755 mkpr.sh
[root@r01edge bin]# cd ../
[root@r01edge maven]# chmod 755 bin
Create a Maven settings file for user dataexplorer1:
[root@r01edge bin]# su - dataexplorer1
[dataexplorer1@r01edge ~]$ mkdir .m2
[dataexplorer1@r01edge ~]$ cd .m2
[dataexplorer1@r01edge .m2]$ vi settings.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven user settings for dataexplorer1 (~/.m2/settings.xml). -->
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
<!-- Credentials per server id. Passwords come from the MAVEN_REPO_PASSWORD
     environment variable (exported in the user's shell), never stored here. -->
<servers>
<server>
<id>nexus</id>
<username>dataexplorer1</username>
<password>${env.MAVEN_REPO_PASSWORD}</password>
</server>
<server>
<id>maven.oracle.com</id>
<username>boris.alexandrov@hotmail.ca</username>
<password>${env.MAVEN_REPO_PASSWORD}</password>
<!-- Oracle's repository sits behind OAM single sign-on: allow basic auth
     on any host/port and tolerate the SSO circular redirects. -->
<configuration>
<basicAuthScope>
<host>ANY</host>
<port>ANY</port>
<realm>OAM 11g</realm>
</basicAuthScope>
<httpConfiguration>
<all>
<params>
<property>
<name>http.protocol.allow-circular-redirects</name>
<value>%b,true</value>
</property>
</params>
</all>
</httpConfiguration>
</configuration>
</server>
</servers>
<!-- Route every request for Maven Central through the local Nexus proxy. -->
<mirrors>
<mirror>
<id>nexus</id>
<mirrorOf>central</mirrorOf>
<name>Mirror for Maven Central</name>
<url>http://192.168.2.22:8081/repository/maven-central/</url>
</mirror>
</mirrors>
<profiles>
<profile>
<id>cbadev</id>
<repositories>
<!-- NOTE(review): http://central is a placeholder URL — the 'nexus' mirror
     above intercepts the 'central' id, so this URL is never contacted. -->
<repository>
<id>central</id>
<url>http://central</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>maven.oracle.com</id>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<url>https://maven.oracle.com</url>
<layout>default</layout>
</repository>
<!-- In-house releases published by 'mvn deploy' (see pom_template.xml). -->
<repository>
<releases />
<snapshots />
<id>maven-releases</id>
<url>http://192.168.2.22:8081/repository/maven-releases/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>central</id>
<url>http://central</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</pluginRepository>
<pluginRepository>
<id>maven.oracle.com</id>
<url>https://maven.oracle.com</url>
</pluginRepository>
</pluginRepositories>
</profile>
</profiles>
<!-- cbadev is always active, so no -P flag is needed on mvn commands. -->
<activeProfiles>
<activeProfile>cbadev</activeProfile>
</activeProfiles>
</settings>
In your environment set the paths and password for nexus:
PATH=${PATH}:${HOME}/bin:/app/maven/bin
export PATH
export path=src/main/scala/com/cba/spark
export MAVEN_REPO_PASSWORD=ThEpAsSword
NOW creating a scala based project for spark-submit will look as follows:
[dataexplorer1@r01edge scala]$ mkpr.sh cbaTest
[dataexplorer1@r01edge scala]$ cd cbaTest
the scala source files will be located in the directory src/main/scala/com/cba/spark
[dataexplorer1@r01edge scala]$ ls -l src/main/scala/com/cba/spark
[dataexplorer1@r01edge scala]$ ls -l $path
Building a snapshot will be as easy as that:
[dataexplorer1@r01edge cbaTest]$ mvn package
Building and deploying snapshot into the nexus maven-snapshots repository:
[dataexplorer1@r01edge cbaTest]$ mvn clean deploy
Building and deploying a release into nexus maven-releases repository:
1. remove the -SNAPSHOT suffix from the version in pom.xml so that only the version number remains
2. [dataexplorer1@r01edge cbaTest]$ mvn clean deploy