I am a caffeinated, busy software junkie. Every day I help teams with solution engineering.

Every day I help teams with the solution engineering aspects of connected vehicle data projects (massive datasets, and always some new datasets arriving with new car models, aka new technologies). Lately, in my spare time, I have been applying ML/deep learning techniques to datasets (many of them created based on observations of real datasets). This blog is here to share some thoughts on my work (the main half of this blog); the other half will be about my family and friends.

Wednesday, February 26, 2014

Personal stock portal… YQL experiments.

In general, lots of portals provide stock quote information. For example, at finance.yahoo.com one can enter a valid stock ticker and get the stock information. From that point, if you want to know more about the company profile, you have to click one more link or make one more web service call. If you want to know more about last year's statistics, that is yet another click or web service call. A couple of years back, after learning a little bit of YQL, I did a small POC: given a stock ticker, the system displays a complete snapshot of the company in one click. It was a fun ride; however, with all the bugs in YQL, it remained a POC only.

There is tons of code, but I will only dump a few important methods.

////

// Driver: run every crawl for one ticker, then write the HTML snapshot.
final String stockSym = "AAPL";
SupplierSection section = new SupplierSection(stockSym);
section.createHttpClient();

// The crawls run in this fixed order, each with the same ticker.
section.crawlFinanceInfo(stockSym);
section.crawlProfileInfo(stockSym);
section.crawlKeyStats(stockSym);
section.crawlRssInfo(stockSym);
section.crawlQuantInfo(stockSym);

// Other outputs that were tried and are currently disabled:
//   section.printAllPublicInfo();
//   section.writeToXmlFile();
//   section.writeToXmlFile1(stockSym);
try {
    section.writehtmlFile1(stockSym);
} catch (Exception e) {
    e.printStackTrace();
}

///basic company info

public void crawlProfileInfo(String stockSym) {

CloseableHttpClient httpclient = HttpClients.createDefault();

try {

GetMethod httpGet = new GetMethod("http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.stocks%20where%20symbol%3D%22"+stockSym+"%22&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys");

httpGet.setFollowRedirects( true );

System.out.println("executing request in crawlProfileInfo " );

int responseCode = httpClient.executeMethod(httpGet);

byte[] data = httpGet.getResponseBody();

if (responseCode >= 400) {

System.out.println("Failed to send request "+ responseCode);

}else{

String xmlString = new String( data, ENCODING );

//System.out.println("xml string "+ xmlString);

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

DocumentBuilder db = factory.newDocumentBuilder();

InputSource inStream = new InputSource();

inStream.setCharacterStream(new StringReader(xmlString));

Document doc = db.parse(inStream);

String fName = "CompanyName";

profileInfo.put(fName,getFieldValue(doc,fName));

fName = "Sector";

profileInfo.put(fName,getFieldValue(doc,fName));

fName = "Industry";

profileInfo.put(fName,getFieldValue(doc,fName));

fName = "FullTimeEmployees";

profileInfo.put(fName,getFieldValue(doc,fName));

fName = "start";

profileInfo.put(fName,getFieldValue(doc,fName));

fName = "end";

profileInfo.put(fName,getFieldValue(doc,fName));

}

/**
 * Queries the YQL {@code yahoo.finance.keystats} table for one ticker and passes the parsed
 * XML document to {@code processKeyStatsResponse}.
 *
 * <p>When the HTTP status is 400 or higher the response is not parsed and a message is
 * printed. Any exception is printed and swallowed.
 *
 * @param stockSym ticker symbol; it is concatenated into the query URL as-is, so it must
 *                 not contain characters that would need URL encoding
 */
public void crawlKeyStats(String stockSym) {
    // Declared outside the try so the connection can be released in finally.
    GetMethod httpGet = null;
    try {
        httpGet = new GetMethod("http://query.yahooapis.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.finance.keystats%20WHERE%20symbol%3D'"+stockSym+"'&diagnostics=false&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys");
        httpGet.setFollowRedirects( true );

        System.out.println("executing request in crawlKeyStats" );
        int responseCode = httpClient.executeMethod(httpGet);
        byte[] data = httpGet.getResponseBody();

        if (responseCode >= 400) {
            System.out.println("Failed to send request "+ responseCode);
        } else {
            String xmlString = new String( data, ENCODING );
            System.out.println("xml string "+ xmlString);

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = factory.newDocumentBuilder();
            InputSource inStream = new InputSource();
            inStream.setCharacterStream(new StringReader(xmlString));
            Document doc = db.parse(inStream);

            processKeyStatsResponse(doc);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Previously empty: release the connection so it is not leaked on every call.
        if (httpGet != null) {
            httpGet.releaseConnection();
        }
    }
}

/**
 * Uses the YQL {@code html} table to select the headline links
 * ({@code //div[@id="yfi_headlines"]/div[2]/ul/li/a}) from the ticker's finance.yahoo.com
 * quote page and passes the parsed XML document to {@code processRssResponse}.
 *
 * <p>When the HTTP status is 400 or higher the response is not parsed and a message is
 * printed. Any exception is printed and swallowed.
 *
 * @param stockSym ticker symbol; it is concatenated into the query URL as-is, so it must
 *                 not contain characters that would need URL encoding
 */
public void crawlRssInfo(String stockSym) {
    // Declared outside the try so the connection can be released in finally.
    GetMethod httpGet = null;
    try {
        httpGet = new GetMethod("http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D'http%3A%2F%2Ffinance.yahoo.com%2Fq%3Fs%3D"+stockSym+"'%20and%20xpath%3D'%2F%2Fdiv%5B%40id%3D%22yfi_headlines%22%5D%2Fdiv%5B2%5D%2Ful%2Fli%2Fa'&diagnostics=true");
        httpGet.setFollowRedirects( true );

        System.out.println("executing request in crawlRssInfo" );
        int responseCode = httpClient.executeMethod(httpGet);
        byte[] data = httpGet.getResponseBody();

        if (responseCode >= 400) {
            System.out.println("Failed to send request "+ responseCode);
        } else {
            String xmlString = new String( data, ENCODING );

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = factory.newDocumentBuilder();
            InputSource inStream = new InputSource();
            inStream.setCharacterStream(new StringReader(xmlString));
            Document doc = db.parse(inStream);

            processRssResponse(doc);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Previously empty: release the connection so it is not leaked on every call.
        if (httpGet != null) {
            httpGet.releaseConnection();
        }
    }
}

Tuesday, February 25, 2014

Very Old: Java code convert Excel (CSV) file to generate SOLR documents

Scenario is :

Every day a mainframe system generates a delta of product data (an Excel/CSV file containing up to one million rows; search needs to happen across 30+ product fields). My initial thought was to use XSLT; however, after seeing that some columns needed computation, I ended up writing some Java code. This code generates one SOLR document XML for each Excel row, and using the post.jar file we indexed the data in one go. (Back in the SOLR 1.4 days the JSON format was not that popular, so I used XML. I have an XSLT version too and will post that code very soon.)

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * Converts a comma-separated product export into Solr XML update files.
 *
 * <p>The first line of the input supplies the column names. Every following line whose
 * first cell parses as an integer is written to its own file containing
 * {@code <add><doc>...</doc></add>}, with one {@code <field name="column">value</field>}
 * element per cell. Lines are split on every comma; quoted cells containing commas are not
 * supported.
 */
public class CsvToSolrXmlDoc
{
    /** Field name used when a row has a cell for which no header is known. */
    private static final String UNKNOWN_COLUMN = "checkInputData";

    /** Directory that receives one {@code <prodId>.xml} file per accepted row. */
    private static final String OUTPUT_DIR = "c:\\a_edocs";

    /** Header names from the first CSV line; {@code null} until {@link #initHeaders} runs. */
    public String[] columnNames = null;

    /**
     * Returns the header name for a column index.
     *
     * @param i zero-based column index
     * @return the header name, or {@code "checkInputData"} when no header exists for
     *         {@code i} (headers not initialised, negative index, or index past the end)
     */
    public String getColumnName(int i)
    {
        if (columnNames == null || i < 0 || i > columnNames.length - 1) {
            return UNKNOWN_COLUMN;
        }
        return columnNames[i];
    }

    /**
     * Stores the header row used to name the fields of every later document.
     *
     * @param cols column names in file order
     */
    public void initHeaders(String[] cols)
    {
        columnNames = cols;
    }

    /**
     * Writes one data row as a Solr XML document named after the row's first cell.
     *
     * <p>Rows that are empty, or whose first cell is not an integer, are skipped. Failures
     * while building or writing the XML are printed and do not propagate, so one bad row
     * does not stop a bulk conversion.
     *
     * @param cols the cells of one CSV line, in column order
     * @throws Exception declared for interface compatibility; not thrown by the current body
     */
    public void writeProdInfo(String[] cols) throws Exception {
        if (cols == null || cols.length == 0) {
            return;
        }

        String prodId = cols[0];
        try {
            Integer.parseInt(prodId); /// this is my use case. U can ignore this validation
        } catch (NumberFormatException e) {
            System.out.println("\n Skiping id "+prodId);
            return;
        }

        String outputFile = OUTPUT_DIR + "\\" + prodId + ".xml";

        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document doc = builder.newDocument();

            Element addEle = doc.createElement("add");
            doc.appendChild(addEle);
            Element root = doc.createElement("doc");
            addEle.appendChild(root);

            for (int i = 0; i < cols.length; i++) {
                String cName = getColumnName(i);

                ////DO ALL column based post processing
                // Guard before trimming; the old null check ran after trim() and was dead code.
                String value = (cols[i] == null) ? "" : cols[i].trim();

                Element node = doc.createElement("field");
                Attr attr = doc.createAttribute("name");
                attr.setValue(cName);
                node.setAttributeNode(attr);
                node.appendChild(doc.createTextNode(value));
                root.appendChild(node);
            }

            DOMSource domSource = new DOMSource(doc);
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            StreamResult result = new StreamResult(new File(outputFile));
            transformer.transform(domSource, result);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a CSV file, takes the first line as headers and writes every other line via
     * {@link #writeProdInfo}.
     *
     * @param inputFile path of the CSV file to convert
     * @throws java.io.FileNotFoundException if {@code inputFile} cannot be opened
     * @throws java.io.IOException if reading the file fails
     */
    public void csvToXML(String inputFile) throws java.io.FileNotFoundException, java.io.IOException
    {
        BufferedReader br = new BufferedReader(new FileReader(inputFile));
        try {
            boolean headerRead = false;
            String line;
            while ((line = br.readLine()) != null) {
                String[] values = line.split(",");

                if (!headerRead) {
                    for (int j = 0; j < values.length; j++) {
                        System.out.println("\n col index,name:"+j+":"+values[j]);
                    }
                    initHeaders(values);
                    headerRead = true;
                } else {
                    try {
                        writeProdInfo(values);
                    } catch (Exception e) {
                        // Keep converting the remaining rows.
                        e.printStackTrace();
                    }
                }
            }
        } finally {
            // Close the reader even when a read fails part-way through the file.
            br.close();
        }
    }

    /**
     * Converts the sample export at {@code C:\a_work\three.csv}.
     *
     * @param argv unused
     * @throws java.io.IOException if the input file cannot be read
     */
    public static void main(String argv[]) throws java.io.IOException
    {
        CsvToSolrXmlDoc cp = new CsvToSolrXmlDoc();
        cp.csvToXML("C:\\a_work\\three.csv");
    }
}

Monday, February 24, 2014

Hierarchical supplier representation & traversal in RDBMS

For example consider organizations like PG, Siemens and GE where the business is organized in multiple verticals and many subsidiaries across the globe. One way of identifying all these units are DUNS numbers. Brief definition of DUNS number is

At least in one standard (Dun & Bradstreet (D&B) provides a D-U-N-S Number,

A unique nine digit identification number, for each physical location of your business.)

Many global manufacturing companies use these DUNS numbers to organize their suppliers and their subsidiaries. In this sequence of posts I will explain a relational way of mapping this supplier data to an RDBMS and then to a graph database (in this prototype, I am using neo4j as the graph database).

In the relational mapping, each suppler entity contains following minimal set of attributes

Supplier primary_duns_number, supplier_name, address and parent duns number.

(In the case of a root supplier organization, primary_duns_number and the parent DUNS number are the same.)

The following Oracle-specific SQL query uses hierarchical-query features such as CONNECT BY and NOCYCLE,

which brings back all parent rows for a given child supplier primary key.

>>

-- Hierarchical (CONNECT BY) walk over GQTS.SUPPLIER starting from one DUNS number.
-- Replace 'DUNS_CODE' with the PRIMARY_DUNS_NUMBER of the starting supplier.
-- Each row is indented by two spaces per LEVEL and carries the '/'-separated path from the start row.
SELECT LEVEL,LPAD (' ', 2 * (LEVEL - 1)) || t1.PRIMARY_DUNS_NUMBER,

t1.SUPPLIER_NAME,

t1.PARENT_DUNS_NUMBER,

t1.address,

SYS_CONNECT_BY_PATH(t1.PRIMARY_DUNS_NUMBER, '/') "Path",

t2.SUPPLIER_NAME AS parent_name,

-- NOTE(review): t2 is the parent row, so this is the parent's own PARENT_DUNS_NUMBER
-- (the grandparent id), not t1's parent id -- confirm the alias is intended.
t2.PARENT_DUNS_NUMBER AS parent_id

FROM GQTS.SUPPLIER t1

-- Self-join to pick up the parent row's name for each supplier.
LEFT JOIN GQTS.SUPPLIER t2 ON t2.PRIMARY_DUNS_NUMBER = t1.PARENT_DUNS_NUMBER

START WITH t1.PRIMARY_DUNS_NUMBER = 'DUNS_CODE'

-- NOTE(review): PRIOR is on PRIMARY_DUNS_NUMBER, so each step moves to rows whose
-- PARENT_DUNS_NUMBER equals the previous row's PRIMARY_DUNS_NUMBER (start row -> children).
-- The accompanying text describes fetching parents; walking upward would instead need
-- t1.PRIMARY_DUNS_NUMBER = PRIOR t1.PARENT_DUNS_NUMBER -- confirm the intended direction.
-- NOCYCLE stops the walk at rows that would loop; the text says root suppliers have
-- PRIMARY_DUNS_NUMBER equal to PARENT_DUNS_NUMBER.
CONNECT BY NOCYCLE PRIOR t1.PRIMARY_DUNS_NUMBER = t1.PARENT_DUNS_NUMBER

>>

Sunday, February 23, 2014

Dhanvi with San Antonio chess Trophy

Much needed confidence.
He had been playing well; however, in the last few tournaments he fell short.
But for this one he made the decision just one day ahead and joined with his friends.

Saturday, February 22, 2014

Deploying Solr 4.6.1 into WebLogic

Assumptions:

Weblogic 10.X is installed & running fine.

Download SOLR 4.6.1 from http://lucene.apache.org/solr/ & expand the zip file.

a) Deploy the war file via console or auto-deploy folder.

You will notice exceptions in the WebLogic console saying that the logging jars are missing.

b) Now copy log jar files from \solr-4.6.1\example\lib\ext to your weblogic domain's lib

On my local machine, this value is : domains\base_domain\lib

c) After copy & restart weblogic & start accessing solr app.

Now system will display tons of exceptions

(see the SOLR JIRA SOLR-4762 for the issue & resolution.)

What I did is I expanded solr.war file (jar -xvf ...) & updated weblogic.xml file.

Following is my weblogic file

<weblogic-web-app
xmlns="http://www.bea.com/ns/weblogic/90"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.bea.com/ns/weblogic/90 http://www.bea.com/ns/weblogic/90/weblogic-web-app.xsd">

<container-descriptor>
<filter-dispatched-requests-enabled>false</filter-dispatched-requests-enabled> <prefer-web-inf-classes>true</prefer-web-inf-classes>
</container-descriptor>

</weblogic-web-app>

d) Note: If you are starting WebLogic with an explicit solr.solr.home parameter, you are good;

for example my value is

-Dsolr.solr.home=C:\Oracle\Middleware\user_projects\domains\base_domain\solr

However, if you are trying to set this value via solr\WEB-INF\web.xml changes

then you will face one more issue, or it will not start at all.

Original web.xml contains



<env-entry>
<env-entry-name>solr/home</env-entry-name>
<env-entry-value>C:\aa_valve_solr\solr</env-entry-value>
<env-entry-type>java.lang.String</env-entry-type>
</env-entry>

The above will not work: WebLogic keeps saying it is not a valid XML file. Use this ordering instead:

<env-entry>
<env-entry-name>solr/home</env-entry-name>
<env-entry-type>java.lang.String</env-entry-type>
<env-entry-value>C:\aa_valve_solr\solr</env-entry-value>
</env-entry>

With this change (yes, the ordering of the elements makes the difference), SOLR will start successfully.

From this point, everything runs fine (i.e. indexing/search & i tried data import handlers etc.)

2014 Texas Data day conference

Recently I went to the Texas Data Day conference, and there were many talks around real-time processing of data pipelines. In particular, there were lots of talks/frameworks around log file processing,

HDFS for storing information,

Lucene based Search engines for finding information

& open source graphics to render UI.

Everything is open source & massively scalable frameworks.

In particular, the Summingbird talk by Twitter's engineers was crazy & interesting.

The following is the speaker deck for that talk.

https://speakerdeck.com/sritchie/the-road-to-summingbird-stream-processing-at-every-scale

This is placeholder. I will add more content later.

Example business rules code in Java

I used to do lots of business rules implementation via Drools

A sample Java example is shown below.

This rule allows claim submission up to the 2nd Tuesday of the month.

Tons of util methods like to validate date ranges, returning month integer as String,

finding second Tuesday of a month, First day etc.

I will add Drools integration code separately.

///In case of null input date, logic uses today's date.

/**
 * Tells whether a date falls strictly between {@link #getFirstDay()} and
 * {@link #getSecondTuesday()}.
 *
 * @param inputDate date to check; when {@code null} the current time is checked instead
 * @return {@code true} when the checked date is after the first-day instant and before the
 *         second-Tuesday instant, {@code false} otherwise
 */
static public boolean isValidClaimDate(Date inputDate ) {
    final Date checkDate = (inputDate != null) ? inputDate : Calendar.getInstance().getTime();

    final Date windowStart = getFirstDay();
    final Date windowEnd = getSecondTuesday();

    // Both bounds are exclusive.
    final boolean insideWindow = windowStart.before(checkDate) && windowEnd.after(checkDate);

    LOGGER.info("Validing current date"+ checkDate + (insideWindow ? " as true" : " as false"));
    return insideWindow;
}

///works fine

/**
 * Returns 18:00:00.000 on the second Tuesday of the current month, evaluated in the time
 * zone named by {@code Constants.estTimeZone}.
 *
 * @return the cutoff instant on the second Tuesday of the current month
 */
static public Date getSecondTuesday() {
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone(Constants.estTimeZone));

    // Take year and month from this calendar, not one in the JVM default zone: near a
    // month boundary the two zones can disagree about the current month.
    cal.set(cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), 1, 18, 0);
    // set(y, m, d, h, min) leaves seconds and milliseconds at their "now" values; clear
    // them so the cutoff does not depend on when this method is called.
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);

    // Advance to the first Tuesday (no-op when the 1st is already a Tuesday).
    while (cal.get(Calendar.DAY_OF_WEEK) != Calendar.TUESDAY) {
        cal.add(Calendar.DAY_OF_MONTH, 1);
    }

    // One more week gives the second Tuesday.
    cal.add(Calendar.DAY_OF_MONTH, 7);

    return cal.getTime();
}

///check

/**
 * Returns midnight (00:00:00.000) on the first day of the current month, evaluated in the
 * time zone named by {@code Constants.estTimeZone}.
 *
 * @return the first instant of the current month
 */
static public Date getFirstDay() {
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone(Constants.estTimeZone));

    // Take year and month from this calendar, not one in the JVM default zone: near a
    // month boundary the two zones can disagree about the current month.
    cal.set(cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), 1);

    // Only HOUR_OF_DAY is set. Setting Calendar.HOUR afterwards makes Calendar resolve
    // the time from HOUR plus the current AM_PM, which gives 12:00 noon when run in the
    // afternoon.
    cal.set(Calendar.HOUR_OF_DAY, 0);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);

    return cal.getTime();
}

/**
 * Tells whether the given month and year are the current month and year in the time zone
 * named by {@code Constants.estTimeZone}.
 *
 * @param month zero-based month, as used by {@link Calendar#MONTH} (January is 0)
 * @param year  four-digit year
 * @return {@code true} only when both values match the current date
 */
static public boolean validUploadDateRange( int month, int year ) {
    final Calendar now = Calendar.getInstance(TimeZone.getTimeZone(Constants.estTimeZone));
    final int monthNow = now.get(Calendar.MONTH); // zero-based
    final int yearNow = now.get(Calendar.YEAR);
    return month == monthNow && yearNow == year;
}

/**
 * String front end for {@link #validUploadDateRange(int, int)}.
 *
 * @param monthStr month as decimal text, zero-based like {@link Calendar#MONTH}
 * @param yearStr  year as decimal text
 * @return {@code false} when either argument is {@code null}, empty or not an integer;
 *         otherwise the result of {@code validUploadDateRange}
 */
static public boolean validClaimsTimeRange( String monthStr, String yearStr ) {
    final boolean monthMissing = monthStr == null || monthStr.isEmpty();
    if (monthMissing) {
        return false;
    }
    final boolean yearMissing = yearStr == null || yearStr.isEmpty();
    if (yearMissing) {
        return false;
    }

    int parsedMonth = 0;
    int parsedYear = 0;
    try {
        parsedMonth = Integer.parseInt(monthStr);
        parsedYear = Integer.parseInt(yearStr);
    } catch (NumberFormatException notNumeric) {
        // Non-numeric input is simply "not a valid range".
        return false;
    }

    return validUploadDateRange(parsedMonth, parsedYear);
}

/**
 * Returns the full name (pattern {@code "MMMM"}, default locale) of the month before the
 * current one, evaluated in the time zone named by {@code Constants.estTimeZone}.
 *
 * @return the previous month's name, e.g. the December name when called in January
 */
static public String validUploadMonthAsStr( ) {
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone(Constants.estTimeZone));

    int currentMonth = cal.get ( Calendar.MONTH ); //note: this scale starts from "0"
    int currentYear = cal.get ( Calendar.YEAR );

    // Month -1 (when called in January) rolls over to December of the previous year
    // because Calendar is lenient by default.
    cal.set( currentYear, currentMonth-1 ,1);
    // Only HOUR_OF_DAY is set; a later set(Calendar.HOUR, 0) would pick up the current
    // AM/PM and could give noon instead of midnight.
    cal.set(Calendar.HOUR_OF_DAY, 0);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);

    Date pastMonth = cal.getTime();

    // Format in the calendar's own zone. With the JVM default zone, a JVM running west
    // of this zone would render midnight on the 1st as the last day of the month before.
    SimpleDateFormat monthName = new SimpleDateFormat("MMMM");
    monthName.setTimeZone(cal.getTimeZone());
    return monthName.format(pastMonth);
}

}

Subscribe to: Posts (Atom)