Wednesday, November 18, 2015

Creating Ambari Hive Views in IBM Open Platform

This blog explains how to create an Ambari Hive View in IBM Open Platform.

Step 1: Log in to the Ambari UI, then go to HDFS --> Configs --> Advanced --> Custom core-site

Add or update the following properties:
hadoop.proxyuser.hive.groups=*
hadoop.proxyuser.root.groups=*
hadoop.proxyuser.root.hosts=*

Save the configuration and restart HDFS.
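
In core-site.xml these standard Hadoop impersonation (proxyuser) entries look like this; `*` allows all groups and hosts, which you would normally narrow down in production:

```xml
<property>
    <name>hadoop.proxyuser.hive.groups</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
</property>
```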


Step 2: Go to Hive --> Configs --> Settings --> Security and set "Run as end user instead of Hive user" to false.


 
This internally sets the property hive.server2.enable.doAs to false. Save the configuration and restart Hive.
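
Behind the scenes this corresponds to the following entry in hive-site.xml:

```xml
<property>
    <name>hive.server2.enable.doAs</name>
    <value>false</value>
</property>
```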

Step 3: Create the /user/admin directory in HDFS and set the proper ownership.

[root@rvm ~]# su - hdfs
[hdfs@rvm ~]$ hadoop fs -mkdir /user/admin
[hdfs@rvm ~]$ hadoop fs -chown admin:hadoop /user/admin
 


Step 4: Click Admin --> Manage Ambari --> Views --> Hive --> Create Instance






Create the instance with your cluster details and save.


Step 5: Open the new Hive View instance and run your queries.





You can now create and use Hive Views from Ambari.

Wednesday, November 11, 2015

Building BigSQL Ad Hoc Application for running Ad Hoc Queries using IBM BigInsights

This blog explains how to build a BigInsights application that runs ad hoc BigSQL queries. You can schedule the application so that it runs at fixed intervals.

Step 1: Create a BigInsights project in Eclipse

Refer to http://big-analytics.blogspot.com.au/2015/07/building-web-crawler-in-ibm-biginsights.html to install the Text Analytics plugin in Eclipse.






Step 2: Update the Oozie workflow.

Here, I create a temp folder on the Linux filesystem, write the query into a SQL script, and then pass the script to the JSQSH client to run the query.

Update the /BigSQLAdhocQuery/BIApp/workflow/workflow.xml

<workflow-app name="wfapp" xmlns="uri:oozie:workflow:0.4">
    <start to="create-folder"/>
    <action name="create-folder">
        <shell xmlns="uri:oozie:shell-action:0.2" >
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <exec>bash</exec>
            <argument>-c</argument>
               <argument>/bin/mkdir -m 777 /tmp/bigSQLAdhocQuery_${wf:id()}</argument>
            <capture-output/>
        </shell>
         <ok to="create-query-file"/>
        <error to="failed-job-cleanup"/>
    </action>
    <action name="create-query-file">
        <shell xmlns="uri:oozie:shell-action:0.2" >
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <exec>bash</exec>
            <argument>-c</argument>
               <argument>/bin/echo "${query}" >> /tmp/bigSQLAdhocQuery_${wf:id()}/test.sql</argument>
            <capture-output/>
        </shell>
        <ok to="run-jsqsh"/>
        <error to="failed-job-cleanup"/>
    </action>
    <action name="run-jsqsh">
        <shell xmlns="uri:oozie:shell-action:0.2" >
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <exec>bash</exec>
            <argument>-c</argument>
               <argument>$JSQSH_HOME/bin/jsqsh -U ${user} -P ${password} -i /tmp/bigSQLAdhocQuery_${wf:id()}/test.sql -n -e ${connectionName}</argument>
            <capture-output/>
        </shell>
        <ok to="cleanup"/>
        <error to="failed-job-cleanup"/>
    </action>
   
    <action name="failed-job-cleanup">
        <shell xmlns="uri:oozie:shell-action:0.2" >
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <exec>bash</exec>
            <argument>-c</argument>
               <argument>rm -rf /tmp/bigSQLAdhocQuery_${wf:id()}</argument>
            <capture-output/>
        </shell>
        <ok to="fail"/>
        <error to="fail"/>
    </action>
   
    <action name="cleanup">
        <shell xmlns="uri:oozie:shell-action:0.2" >
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <exec>bash</exec>
            <argument>-c</argument>
               <argument>rm -rf /tmp/bigSQLAdhocQuery_${wf:id()}</argument>
            <capture-output/>
        </shell>
        <ok to="end"/>
        <error to="end"/>
    </action>
   
    <kill name="fail">
        <message>error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="end"/>
</workflow-app>
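
The three shell actions can be sketched locally as follows. The workflow id and the query are placeholder values standing in for `${wf:id()}` and `${query}`, and the jsqsh invocation is shown as a comment because it needs a live BigSQL server:

```shell
#!/bin/sh
# Local sketch of what the workflow's shell actions do.
WFID="demo-0001"                                    # placeholder for ${wf:id()}
QUERY="SELECT tabschema, tabname FROM syscat.tables" # placeholder for ${query}
WORKDIR="/tmp/bigSQLAdhocQuery_${WFID}"

rm -rf "${WORKDIR}"      # start clean in case an earlier failed run left the dir

# create-folder: world-writable temp dir keyed by the workflow id
mkdir -m 777 "${WORKDIR}"

# create-query-file: store the query in a script for jsqsh
echo "${QUERY}" >> "${WORKDIR}/test.sql"

# run-jsqsh (needs a live BigSQL server, so shown as a comment):
#   $JSQSH_HOME/bin/jsqsh -U "${user}" -P "${password}" -i "${WORKDIR}/test.sql" -n -e "${connectionName}"
cat "${WORKDIR}/test.sql"

# cleanup / failed-job-cleanup: remove the temp dir either way
rm -rf "${WORKDIR}"
```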


Step 3: Update the /BigSQLAdhocQuery/BIApp/workflow/config-default.xml

<configuration>
    <property>
        <name>connectionName</name>
        <value>null</value>
    </property>
    <property>
        <name>user</name>
        <value>null</value>
    </property>
    <property>
        <name>password</name>
        <value>null</value>
    </property>
    <property>
        <name>query</name>
        <value>null</value>
    </property>
</configuration>
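
These defaults are overridden at run time. If you want to test the workflow directly with the Oozie CLI instead of the BigInsights console, the same parameters can go in a job.properties file; all host names, paths, credentials, and the query below are placeholder values:

```properties
nameNode=hdfs://namenode.example.com:8020
jobTracker=jobtracker.example.com:8050
queueName=default
oozie.wf.application.path=${nameNode}/user/biadmin/BigSQLAdhocQuery
connectionName=bigsql
user=biadmin
password=secret
query=SELECT tabschema, tabname FROM syscat.tables
```

Then submit it with something like `oozie job -oozie http://oozie-host:11000/oozie -config job.properties -run`.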



Step 4: Update the /BigSQLAdhocQuery/BIApp/application/application.xml

<application-template xmlns="http://biginsights.ibm.com/application" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <name>BigSQLAdhocQuery</name>
    <properties>
        <property defaultValue="bigsql" isInputPath="false" isOutputPath="false" isRequired="true" label="JSQSHConnectionName" name="connectionName" paramtype="STRING" uitype="textfield"/>
        <property isInputPath="false" isOutputPath="false" isRequired="true" label="User" name="user" paramtype="STRING" uitype="textfield"/>
        <property isInputPath="false" isOutputPath="false" isRequired="true" label="Password" name="password" paramtype="PASSWORD" uitype="textfield"/>
        <property isInputPath="false" isOutputPath="false" isRequired="true" label="Query" name="query" paramtype="TEXTAREA" uitype="textfield"/>
    </properties>
    <assets>
        <asset id="BigSQLAdhocQuery" type="WORKFLOW"/>
    </assets>
    <imagePath>defaultApp_L.png</imagePath>
</application-template>


Step 5: Publish the BigInsights Application.

Refer to http://big-analytics.blogspot.com.au/2015/07/building-web-crawler-in-ibm-biginsights.html for steps to publish the application.








Step 6: Deploy the application from the web console and run it.

You can run BigSQL v1 or BigSQL queries by creating a JSQSH connection and passing its name as input.

You can provide ad hoc BigSQL queries and schedule them to run at fixed intervals.
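
For example, a simple catalog query like the following could be entered in the Query field; `syscat.tables` is the DB2-style catalog view that BigSQL exposes, and any valid BigSQL statement should work the same way:

```sql
-- list the first ten tables visible in the catalog
SELECT tabschema, tabname
FROM syscat.tables
FETCH FIRST 10 ROWS ONLY;
```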