Thursday, March 8, 2012

Parallelizable integer multiplication

At a recent interview with an Amazon, the hiring manager asked me about an algorithm to do integer multiplication without using any multiplication/division symbol in parallel computing.  The following algorithm was suggested to do integer multiplication without using any "MULT" instruction:


int mult(int x, int y)
{
 int z;

 if ((x==0) || (y==0))
  return 0;
 if (x==1)
  return y;

 if (y==1)
  return x;

 z = mult(x, y>>1);
 if (y % 2 == 0)
  return z<<1;
 else
  return x + (z<<1);
}

My question came up, is this parallelizable?  My argument to the interviewer was that this is not a good approach if he is asking from parallel computation perspective (See Robertazzi's papers on ACM journals to know more detail about parallel computation)

When a recursive call is made, the parameters are pushed into the stack before calling each subsequence recursive call.  Involving stack in this manner is hardly parallelizable, because there is coupling of current result based on previous results.  One of the principle of parallel computing is to decouple two or more computation as much as we can.

A dig into highly-optimized assembly language of the above code:




mult:
 pushl %ebp
 movl %esp, %ebp
 subl $24, %esp
 movl %ebx, -8(%ebp)
 movl %esi, -4(%ebp)
 movl 12(%ebp), %ebx
 movl 8(%ebp), %esi
 testl %ebx, %ebx
 je .L4
 testl %esi, %esi
 je .L4
 cmpl $1, %esi
 je .L2
 cmpl $1, %ebx
 .p2align 4,,3
 je .L5
 movl %ebx, %eax
 movl %esi, (%esp)
 sarl %eax
 movl %eax, 4(%esp)
 call mult
 andl $1, %ebx
 je .L7
 leal (%esi,%eax,2), %ebx
.L2:
 movl %ebx, %eax
 movl -4(%ebp), %esi
 movl -8(%ebp), %ebx
 leave
 ret
 .p2align 4,,10
 .p2align 3
.L4:
 xorl %ebx, %ebx
 movl -4(%ebp), %esi
 movl %ebx, %eax
 movl -8(%ebp), %ebx
 leave
 ret
 .p2align 4,,10
 .p2align 3
.L7:
 leal (%eax,%eax), %ebx
 movl -4(%ebp), %esi
 movl %ebx, %eax
 movl -8(%ebp), %ebx
 leave
 ret
 .p2align 4,,10
 .p2align 3
.L5:
 movl %esi, %ebx
 movl -4(%ebp), %esi
 movl %ebx, %eax
 movl -8(%ebp), %ebx
 leave
 ret


As can be seen, "SARL" (Shift Arithmetic Left) for the shift-left operation.  Some issues after analyzing this code.  First, there are multiple register-to-memory flow while doing the recursive:


movl %ebx, %eax
 movl %esi, (%esp)
 sarl %eax
 movl %eax, 4(%esp)


Memory access is expensive and the instructions might cause cache miss thus forcing CPU to reread data from RAM.
Another issue, how do we parallelize this, if there is stack push-pop coupling for the result?


My argument was to decompose the computation into two (or more) portions.



For example:

int pivot = b/2;

for(i=0; i<pivot; i++)
    sum1 += a;

for(i=pivot+1; i<b; i++)
    sum2 += a;
sum = sum1 + sum2;


No, let's check what we can see in the x86 assembly (gcc-generated):

mult:
 pushl %ebp
 movl %esp, %ebp
 subl $8, %esp
 movl 12(%ebp), %eax
 movl %ebx, (%esp)
 movl %esi, 4(%esp)
 movl 8(%ebp), %edx
 testl %eax, %eax
 je .L5
 testl %edx, %edx
 je .L5
 cmpl $1, %edx
 je .L2
 cmpl $1, %eax
 .p2align 4,,3
 je .L6
 movl %eax, %ecx
 xorl %esi, %esi
 shrl $31, %ecx
 xorl %ebx, %ebx
 addl %eax, %ecx
 sarl %ecx
 testl %ecx, %ecx
 jle .L3
 movl %ecx, %esi
 movl %ecx, %ebx
 imull %edx, %esi
.L3:
 xorl %ecx, %ecx
 cmpl %ebx, %eax
 jle .L4
 movl %eax, %ecx
 subl %ebx, %ecx
 imull %edx, %ecx
.L4:
 leal (%ecx,%esi), %eax
.L2:
 movl (%esp), %ebx
 movl 4(%esp), %esi
 leave
 ret
 .p2align 4,,10
 .p2align 3
.L5:
 xorl %eax, %eax
 movl (%esp), %ebx
 movl 4(%esp), %esi
 leave
 ret
 .p2align 4,,10
 .p2align 3
.L6:
 movl %edx, %eax
 jmp .L2

As can be seen, most of operations are in registers, also there is decoupling between 2 partitions (e.g, sum1 does not rely on sum2). The more we partition the calculation (e.g, pivot1 = b/4, and do 4 for-next loops for each partition), the more divisible partition can be accomplished by parallel computer. I was arguing that to optimize computation, hence lowering O(n), we should use divisible loops ("divide-et-empera") instead of recursive approach (seems the interviewer disagreed)

PS: My argument above caused me not to land job at Amazon.

Sunday, February 26, 2012

Simple UDP Client

<pre>

/* UDP client in the internet domain */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

#define MIN(x, y)  ((x) < (y) ? (x) : (y))
#define MAX(x, y)  ((x) > (y) ? (x) : (y))

void error(const char *);

int main(int argc, char *argv[])
{
   int sock, n;
   unsigned int length;
   struct sockaddr_in server, from;
   struct hostent *hp;
   char buffer[256];

   if (argc < 3) {
        printf("Usage: %s <port>\n", argv[0]);
        exit(1);
   }
   sock= socket(AF_INET, SOCK_DGRAM, 0);
   if (sock < 0) error("socket");

   server.sin_family = AF_INET;
   hp = gethostbyname(argv[1]);
   if (hp==0) error("Unknown host");

   bcopy((char *)hp->h_addr,
        (char *)&server.sin_addr,
         hp->h_length);
   server.sin_port = htons(atoi(argv[2]));
   length=sizeof(struct sockaddr_in);
   printf("Please enter the message: ");
   bzero(buffer, sizeof(buffer));
   fgets(buffer, sizeof(buffer)-1, stdin);
   n=sendto(sock,buffer,
            strlen(buffer),0,(const struct sockaddr *)&server,length);
   if (n < 0)
       error("Sendto");
   n = recvfrom(sock, buffer, sizeof(buffer), 0, (struct sockaddr *)&from, &length);
   if (n < 0)
       error("recvfrom");
   printf("Got an ack: %d bytes\n", n);
   write(1, buffer, MIN(n, strlen(buffer)));
   close(sock);
   return 0;
}

void error(const char *msg)
{
    perror(msg);
    exit(0);
}

</pre>

Simple UDP Server

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <netdb.h>
#include <unistd.h>
#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <arpa/inet.h>

extern int h_errno;

void error(const char *msg);

int UdpSocket(int protocol)
{
 int sockfd;

    sockfd = socket(AF_INET, SOCK_DGRAM, 0);
 if (-1 == sockfd)
 {
  error(__func__);
 }
 return sockfd;
}


void error(const char *msg)
{
 printf("Error Code: %d", errno);
 perror(msg);
}


int BindSocketToAddr(int sock, struct sockaddr_in *addr, int port)
{
 int rc;

    if (!addr)
    {
        printf("%s: NULL pointer in addr\n", __func__);
        return -1;
    }
 bzero(addr, sizeof(struct sockaddr_in));
 addr->sin_family = AF_INET;
 addr->sin_addr.s_addr = INADDR_ANY;
 addr->sin_port = htons(port);
 rc = bind(sock, (struct sockaddr *)addr, sizeof(struct sockaddr));
 if (rc < 0)
 {
  perror(__func__);
 }
    return rc;
}



int main(int argc, char **argv)
{
 int sockfd;
 struct sockaddr_in fromSockAddr;
 struct sockaddr_in serverSockAddr;
 socklen_t fromlen;
 char buf[1024];
 char fromAddrStr[512];
 int n;

    if (argc < 2)
    {
        printf("\n%s <PORT NUMBER>\n\n", argv[0]);
        exit(0);
    }
    setuid(0);

 printf("UDP Server\n");

 sockfd = UdpSocket(0);
 if (sockfd > 0)
 {
  BindSocketToAddr(sockfd, &serverSockAddr, atoi(argv[1]));
  fromlen = sizeof(fromSockAddr);
  while (1) {
   /* wait for client to send a message */
   n = recvfrom(sockfd, buf, sizeof(buf), 0, (struct sockaddr *)&fromSockAddr, &fromlen);
   if (n < 0)
   {
    error("recvfrom");
    break;
   }
   else {
    inet_ntop(fromSockAddr.sin_family, &fromSockAddr.sin_addr, fromAddrStr, sizeof(fromAddrStr));
    printf("GOT msg from: %s\n", fromAddrStr);
    /* echo back */
    n = sendto(sockfd, buf, n, 0, (struct sockaddr *)&fromSockAddr, sizeof(fromSockAddr));
    if (n < 0)
    {
     error("sendto");
     break;
    }
   }
  }
 }
 return 0;
}

Saturday, February 4, 2012

Intro to Ruby

Just got my hands dirty with Ruby today. It's an interesting language, yet simple to learn. Unlike some other dynamic programming languages that too cryptic to learn, Ruby is easy to learn, similar to Tcl.

Here's an example of a program I tested:


#!/usr/bin/ruby

presidents = ["Ford", "Carter", "Reagan", "Bush1", "Clinton", "Bush2", "Obama"]

for fwd in [-1,1]
    if fwd == 1 
        print "Forward/ascending\n"
        dec = 0
    else
        print "Backward/descending\n"
        dec = -1
    end
    for ss in 0...presidents.length
        print ss+1, ": ", presidents[ss*fwd+dec], "\n";
    end
end


The output:


Backward/descending
1: Obama
2: Bush2
3: Clinton
4: Bush1
5: Reagan
6: Carter
7: Ford
Forward/ascending
1: Ford
2: Carter
3: Reagan
4: Bush1
5: Clinton
6: Bush2
7: Obama

Sunday, November 6, 2011

TinyXML

#include <ltinyxml.h>

// ----------------------------------------------------------------------
// STDOUT dump and indenting utility functions
// ----------------------------------------------------------------------
const unsigned int NUM_INDENTS_PER_SPACE=2;

const char * getIndent( unsigned int numIndents )
{
 static const char * pINDENT="                                      + ";
 static const unsigned int LENGTH=strlen( pINDENT );
 unsigned int n=numIndents*NUM_INDENTS_PER_SPACE;
 if ( n > LENGTH ) n = LENGTH;
 
 return &pINDENT[ LENGTH-n ];
}

// same as getIndent but no "+" at the end
const char * getIndentAlt( unsigned int numIndents )
{
 static const char * pINDENT="                                        ";
 static const unsigned int LENGTH=strlen( pINDENT );
 unsigned int n=numIndents*NUM_INDENTS_PER_SPACE;
 if ( n > LENGTH ) n = LENGTH;
 
 return &pINDENT[ LENGTH-n ];
}

int dump_attribs_to_stdout(TiXmlElement* pElement, unsigned int indent)
{
 if ( !pElement ) return 0;
 
 TiXmlAttribute* pAttrib=pElement->FirstAttribute();
 int i=0;
 int ival;
 double dval;
 const char* pIndent=getIndent(indent);
 printf("\n");
 while (pAttrib)
 {
  printf( "%s%s: value=[%s]", pIndent, pAttrib->Name(), pAttrib->Value());
  
  if (pAttrib->QueryIntValue(&ival)==TIXML_SUCCESS)    printf( " int=%d", ival);
  if (pAttrib->QueryDoubleValue(&dval)==TIXML_SUCCESS) printf( " d=%1.1f", dval);
  printf( "\n" );
  i++;
  pAttrib=pAttrib->Next();
 }
 return i;
}

void dump_to_stdout( TiXmlNode* pParent, unsigned int indent = 0 )
{
 if ( !pParent ) return;
 
 TiXmlNode* pChild;
 TiXmlText* pText;
 int t = pParent->Type();
 printf( "%s", getIndent(indent));
 int num;
 
 switch ( t )
 {
  case TiXmlNode::TINYXML_DOCUMENT:
   printf( "Document" );
   break;
   
  case TiXmlNode::TINYXML_ELEMENT:
   printf( "Element [%s]", pParent->Value() );
   num=dump_attribs_to_stdout(pParent->ToElement(), indent+1);
   switch(num)
   {
    case 0:  printf( " (No attributes)"); break;
    case 1:  printf( "%s1 attribute", getIndentAlt(indent)); break;
    default: printf( "%s%d attributes", getIndentAlt(indent), num); break;
   }
   break;
   
    case TiXmlNode::TINYXML_COMMENT:
     printf( "Comment: [%s]", pParent->Value());
     break;
     
    case TiXmlNode::TINYXML_UNKNOWN:
     printf( "Unknown" );
     break;
     
    case TiXmlNode::TINYXML_TEXT:
     pText = pParent->ToText();
     printf( "Text: [%s]", pText->Value() );
     break;
     
    case TiXmlNode::TINYXML_DECLARATION:
     printf( "Declaration" );
     break;
    default:
     break;
 }
 printf( "\n" );
 for ( pChild = pParent->FirstChild(); pChild != 0; pChild = pChild->NextSibling())
 {
  dump_to_stdout( pChild, indent+1 );
 }
}

// load the named file and dump its structure to STDOUT
void dump_to_stdout(const char* pFilename)
{
 TiXmlDocument doc(pFilename);
 bool loadOkay = doc.LoadFile();
 if (loadOkay)
 {
  printf("\n%s:\n", pFilename);
  dump_to_stdout( &doc ); // defined later in the tutorial
 }
 else
 {
  printf("Failed to load file \"%s\"\n", pFilename);
 }
}


void build_simple_doc( )
{
 // Make xml: World
 TiXmlDocument doc;

 TiXmlDeclaration * declaration = new TiXmlDeclaration( "1.0", "UTF-8", "" );
 TiXmlElement * root = new TiXmlElement( "mipsdiag" );

 doc.LinkEndChild( declaration );
 doc.LinkEndChild( root );

 TiXmlElement * cpu = new TiXmlElement( "cpu" );
 root->LinkEndChild(cpu);
 
 TiXmlComment * comment = new TiXmlComment();
 comment->SetValue("-- CPU utilization --" );
 cpu->LinkEndChild(comment);

 TiXmlElement *cpu_res = new TiXmlElement("cpu_resouce");
 cpu_res->SetAttribute("type", "utilization");
 cpu_res->SetAttribute("units", "percent");
 cpu->LinkEndChild(cpu_res);

 TiXmlText *cpu_utilization = new TiXmlText("0.11");
 cpu_res->LinkEndChild(cpu_utilization);

 dump_to_stdout( &doc );
 doc.SaveFile( "mipsdiag.xml" );
}


int main()
{
 build_simple_doc();
}

Tuesday, October 18, 2011

Ooma has dialtone, but unable to make calls

There might be a bug in Old Ooma Broadband Voip device which for some reason could not establish voice connection (although it could connect to Ooma gateway  and we could hear its unique dialtone).

To fix it is actually very simple, just unplug the power to it and leave it like that for about 15 seconds, and than replug it