Download as pdf or txt
Download as pdf or txt
You are on page 1of 29

Adina Institute of Science & Technology

Department of Computer Science & Engg.


M.Tech CSE-I Sem
Lab File
Advanced Computer Network(MCSE-105)
Advanced Computer Architecture(MCSE 103)

Submitted To: Submitted By:


INDEX

S.No NAME OF EXPERIMENT DATE OF DATE OF REMARK


PERFORMANCE SUBMISSION

1 Write a C program to implement


sliding window protocol & go-
back n protocol.

2 Implement date and time display


from local host to server using
TCP.

3 Implement Dijkstra’s algorithm to


compute the Shortest path in a
graph.

4 Write a client-server application


for chat using TCP.

5 Implementation of echo client


server using TCP/IP.

6 Case study of Architecture of Star


100 & T1-ASC.

7 Case study of Architecture of


Cyber-205

8 Case study of Pentium & power


PC addressing modes.

9 Case study of RISC pipelining.

10 Case study of Pentium 4.


Experiment-1

Aim: Write a C program to implement sliding window protocol & go-back n protocol.

// SLIDING WINDOW PROTOCOL


Client :
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
/*
 * Sliding-window protocol, receiver side ("client").
 * Reads the reference data from file "check", receives the transmitted
 * window of frames from a System V message queue (key 89) and verifies
 * that the frames arrived in the correct sequence.
 */
struct mymsgbuf
{
    long mtype;      /* message type; the sender stamps frames with type 1 */
    char mtext[25];  /* one window of frame bytes */
};

FILE *fp;

int main()
{
    struct mymsgbuf buf;
    int msgid;
    int i = 0, s;
    int count = 0, frmsz;
    int a[100];
    int d;  /* int, not char: getc() signals end-of-file with EOF */

    if ((msgid = msgget(89, IPC_CREAT | 0666)) == -1)
    {
        printf("\n ERROR IN MSGGET");
        exit(0);
    }
    printf("\n Enter the frame size:");
    if (scanf("%d", &frmsz) != 1 || frmsz < 1 || frmsz > 25)
    {
        printf("\n INVALID FRAME SIZE");  /* fix: mtext holds at most 25 bytes */
        exit(0);
    }
    if ((fp = fopen("check", "r")) == NULL)
    {
        printf("\n FILE NOT OPENED");
        exit(0);  /* fix: original fell through and passed NULL to getc() */
    }
    printf("\n FILE OPENED");
    /* fix: the original while(!feof(fp)) loop stored the EOF marker as a
     * bogus trailing byte; test getc() directly and bound the array. */
    while (i < 100 && (d = getc(fp)) != EOF)
        a[i++] = d;
    fclose(fp);
    s = i;
    for (i = 0; i < frmsz && i < s; i++)  /* echo the expected bytes */
        printf("\t %c", a[i]);
    for (i = 0; i < frmsz; i++)
    {
        /* fix: argument order is msgrcv(id, buf, msgsz, msgtyp, msgflg);
         * the original passed msgtyp=0, msgflg=1, i.e. "any type" with a
         * meaningless flag.  Request type-1 messages and block. */
        if (msgrcv(msgid, &buf, sizeof(buf.mtext), 1, 0) == -1)
        {
            printf("\n ERROR IN MSGRCV");
            exit(0);
        }
        printf("\n RECEIVED FRAMES ARE:%c", buf.mtext[i]);
    }
    for (i = 0; i < frmsz; i++)
    {
        if (a[i] == buf.mtext[i])
            count++;
    }
    if (count == frmsz)
        printf("\n FRAMES WERE RECEIVED IN CORRECT SEQ");
    else
        printf("\n FRAMES WERE NOT RECEIVED IN CORRECT SEQ");
    return 0;
}

Server
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
/*
 * Sliding-window protocol, sender side ("server").
 * Reads the data to transmit from file "send", copies the requested
 * window [si..ei] into a message, and sends it over the System V
 * message queue (key 89).
 */
struct mymsgbuf
{
    long mtype;      /* message type; this sender always uses 1 */
    char mtext[25];  /* one window of frame bytes */
};

FILE *fp;

int main()
{
    /* fix: the source text had "struct" and "s=i" split across lines
     * ("{s" / "truct", "}s" / "=i;"), which does not compile. */
    struct mymsgbuf buf;
    int si, ei, sz;
    int msgid;
    int i = 0, s;
    int a[100];
    int d;  /* int so the EOF value returned by getc() is representable */

    if ((fp = fopen("send", "r")) == NULL)
    {
        printf("\n FILE NOT OPENED");
        exit(0);  /* fix: do not read from a NULL stream */
    }
    printf("\n FILE OPENED");
    printf("\n Enter starting and ending index of frame array:");
    if (scanf("%d%d", &si, &ei) != 2 || si < 0 || ei < si || ei >= 25)
    {
        printf("\n INVALID WINDOW");  /* fix: indices must fit in mtext[25] */
        exit(0);
    }
    sz = ei - si;
    if ((msgid = msgget(89, IPC_CREAT | 0666)) == -1)
    {
        printf("\n ERROR IN MSGGET");
        exit(0);
    }
    /* fix: read until getc() returns EOF instead of testing feof(),
     * which lags one read behind and stored a bogus trailing byte. */
    while (i < 100 && (d = getc(fp)) != EOF)
        a[i++] = d;
    fclose(fp);
    s = i;
    buf.mtype = 1;
    for (i = si; i <= ei; i++)
        buf.mtext[i] = a[i];
    for (i = si; i <= ei; i++)  /* the frames to be sent */
        printf("\t %c", buf.mtext[i]);
    for (i = 0; i <= sz; i++)
    {
        /* fix: send only the payload; sizeof(buf) also counted mtype and
         * padding, so the kernel copied bytes past the intended payload. */
        if (msgsnd(msgid, &buf, sizeof(buf.mtext), 0) == -1)
        {
            printf("\n ERROR IN MSGSND");
            exit(0);
        }
    }
    printf("\n FRAMES SENT");
    return 0;
}

Go back arq protocol:

Server program Code:


#include<stdio.h>
#include<string.h>
#include<sys/socket.h>
#include<sys/types.h>
#include<netinet/in.h>
#include<arpa/inet.h>
#define SIZE 4
/*
 * Go-Back-N ARQ, sender side.
 * Splits the input text into SIZE-byte frames, appends the sequence
 * numbers to each frame, and retransmits from the reported frame
 * whenever the receiver signals an error (receiver answers -1 when the
 * frame arrived intact).
 */
int main()
{
    int std, lfd, len, i, j, status, sport;
    char str[20], frame[20], temp[20], ack[20];
    struct sockaddr_in saddr, caddr;
    socklen_t clen;  /* fix: accept() takes socklen_t*, not int* */

    printf("Enter the port address");
    scanf("%d", &sport);
    std = socket(AF_INET, SOCK_STREAM, 0);
    if (std < 0)
        perror("Error");
    bzero(&saddr, sizeof(saddr));
    saddr.sin_family = AF_INET;
    saddr.sin_addr.s_addr = htonl(INADDR_ANY);
    saddr.sin_port = htons(sport);
    lfd = bind(std, (struct sockaddr *)&saddr, sizeof(saddr));
    if (lfd)
        perror("Bind Error");
    listen(std, 5);
    clen = sizeof(caddr);  /* fix: was sizeof(&caddr) -- the size of a pointer */
    lfd = accept(std, (struct sockaddr *)&caddr, &clen);
    printf("Enter the text:");
    scanf("%s", str);
    i = 0;
    while (i < (int)strlen(str))
    {
        memset(frame, 0, 20);
        strncpy(frame, str + i, SIZE);  /* next SIZE-byte frame (frame is zeroed) */
        printf("\nTransmitting frames:");
        len = strlen(frame);
        for (j = 0; j < len; j++)
        {
            printf("%d", i + j);
            sprintf(temp, "%d", i + j);
            strcat(frame, temp);  /* append the sequence numbers */
        }
        write(lfd, frame, sizeof(frame));
        read(lfd, ack, 20);
        sscanf(ack, "%d", &status);
        if (status == -1)
            printf("\nTransmission successful");
        else
        {
            /* Receiver reported an error at frame 'status': go back and
             * resend from there to the end of that 4-byte window. */
            printf("Received error in: %d", status);
            printf("\nRetransmitting frames");
            for (j = 0;;)
            {
                frame[j] = str[j + status];
                j++;
                printf("%d", j + status);
                if ((j + status) % 4 == 0)
                    break;
            }
            printf("\n");
            frame[j] = '\0';
            len = strlen(frame);
            for (j = 0; j < len; j++)
            {
                sprintf(temp, "%d", j + status);
                strcat(frame, temp);
            }
            write(lfd, frame, sizeof(frame));
        }
        i = i + SIZE;
    }
    write(lfd, "Exit", sizeof("Exit"));
    printf("\nExitting!\n");
    sleep(2);
    close(lfd);
    close(std);
    return 0;
}

Client program Code:


#include<stdio.h>
#include<string.h>
#include<sys/socket.h>
#include<sys/types.h>
#include<netinet/in.h>
#include<arpa/inet.h>
/*
 * Go-Back-N ARQ, receiver side.
 * Receives frames from the sender, lets the user simulate an error by
 * entering the sequence number of a corrupted frame, and acknowledges
 * intact frames with "-1".  Terminates when the sender transmits "Exit".
 */
int main()
{
    int std, choice, cport;  /* fix: removed unused lfd, len, str1, caddr */
    char str[20], err[20];
    struct sockaddr_in saddr;

    printf("Enter the port address:");
    scanf("%d", &cport);
    std = socket(AF_INET, SOCK_STREAM, 0);
    if (std < 0)
        perror("Error");
    bzero(&saddr, sizeof(saddr));
    saddr.sin_family = AF_INET;
    inet_pton(AF_INET, "127.0.0.1", &saddr.sin_addr);
    saddr.sin_port = htons(cport);
    connect(std, (struct sockaddr *)&saddr, sizeof(saddr));
    for (;;)
    {
        read(std, str, 20);
        if (strcmp(str, "Exit") == 0)
        {
            printf("Exitting!\n");
            break;
        }
        /* fix: the prompt was garbled ("1 - S or 0 - NO") */
        printf("Received: %s\nError? 1 - YES or 0 - NO", str);
        scanf("%d", &choice);
        if (choice == 0)
            write(std, "-1", sizeof("-1"));  /* positive acknowledgement */
        else
        {
            printf("Enter the sequence no of the frame where error has occured");
            scanf("%s", err);
            write(std, err, sizeof(err));
            read(std, str, 20);
            printf("Received the transmitted frame: %s\n", str);
        }
    }
    close(std);
    return 0;
}
Experiment-2

Aim: Implement date and time display from local host to server using TCP.

Server program Code:


//Program for TCP date-time. Server code.
#include<string.h>
#include<stdlib.h>
#include<unistd.h>
#include<stdio.h>
#include<sys/types.h>
#include<sys/socket.h>
#include<sys/time.h>
#include<time.h>
#include<netinet/in.h>
/*
 * TCP daytime server: listens on the given port (default 5500) and
 * writes the current date/time string to every client that connects.
 */
int main(int argc, char *argv[])
{
    int listenfd, connfd;
    struct sockaddr_in servaddr;
    char buff[128 + 1];
    time_t ticks;
    unsigned short port;

    port = (argc > 1) ? atoi(argv[1]) : 5500;
    listenfd = socket(AF_INET, SOCK_STREAM, 0);
    if (listenfd < 0)
    {
        perror("In socket()");  /* fix: message said "In listen()" for a socket() failure */
        exit(1);
    }
    bzero((char *)&servaddr, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
    servaddr.sin_port = htons(port);
    if (bind(listenfd, (struct sockaddr *)&servaddr, sizeof(servaddr)))
    {
        perror("Error in binding");
        exit(1);
    }
    if (listen(listenfd, 7) < 0)
    {
        perror("Error in listen");
        exit(1);
    }
    for (;;)
    {
        connfd = accept(listenfd, (struct sockaddr *)NULL, NULL);
        if (connfd < 0)
        {
            perror("Error while connecting");
            exit(1);
        }
        ticks = time(NULL);
        /* %.24s drops the trailing newline that ctime() appends */
        sprintf(buff, "%.24s\r\n", ctime(&ticks));
        write(connfd, buff, strlen(buff));
        close(connfd);
        /* fix: the original called exit(0) here, so the for(;;) accept
         * loop only ever served a single client. */
    }
}

Client program Code:


//Program for TCP date-time. Client code.
#include<string.h>
#include<stdlib.h>
#include<unistd.h>
#include<stdio.h>
#include<sys/types.h>
#include<sys/socket.h>
#include<sys/time.h>
#include<time.h>
#include<netinet/in.h>
#include<netdb.h>
#include<arpa/inet.h>
/*
 * TCP daytime client: connects to host:port (default localhost:5500)
 * and prints whatever the daytime server sends.
 */
int main(int argc, char *argv[])
{
    int sockfd, n;
    char recvline[128];
    struct sockaddr_in servaddr;
    unsigned short port;
    char *hostname;
    struct hostent *hp;

    hostname = (argc > 1) ? argv[1] : "localhost";
    port = (argc > 2) ? atoi(argv[2]) : 5500;
    if ((hp = gethostbyname(hostname)) == 0)
    {
        perror("Error in getting host in gethostbyname()");
        exit(0);
    }
    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    {
        perror("Error while creation of socket");
        exit(1);
    }
    bzero((char *)&servaddr, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(port);
    /* fix: bound the copy by the destination size; h_length comes from
     * the resolver (4 for IPv4) but should not be trusted blindly. */
    if (hp->h_length > (int)sizeof(servaddr.sin_addr))
    {
        fprintf(stderr, "Unexpected address length from gethostbyname()\n");
        exit(1);
    }
    memcpy(&servaddr.sin_addr, hp->h_addr, hp->h_length);
    /* fix: s_addr is unsigned, so the original "<= 0" test was really
     * "== 0"; state the intent directly. */
    if (servaddr.sin_addr.s_addr == 0)
    {
        perror("Bad address after gethostbyname()");
        exit(1);
    }
    if (connect(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0)
    {
        perror("Error while connecting");
        exit(1);
    }
    printf("Today the daytime and date is:");
    while ((n = read(sockfd, recvline, sizeof(recvline) - 1)) > 0)
    {
        recvline[n] = 0;  /* NUL-terminate before treating as a string */
        if (fputs(recvline, stdout) == EOF)
        {
            perror("Error in fputs()");
            exit(1);
        }
    }
    if (n < 0)
    {
        perror("Error while read");
        exit(1);
    }
    exit(0);
}
Experiment-3

Aim: Implement Dijkstra’s algorithm to compute the Shortest path in a graph.

#include <stdio.h>
#include <conio.h>
#define MAX 1000

void dijkstra(int n,int v,int cost[10][10],int dist[10]);

/*
 * Driver: reads an n-node weight matrix (1000 denotes "infinity") and
 * prints the shortest distance from a chosen source node (1-based) to
 * every node.
 */
int main()
{
    /* Prototype repeated locally so this unit is self-contained. */
    void dijkstra(int n, int v, int cost[10][10], int dist[10]);
    int n, v, i, j, cost[10][10], dist[10];

    printf("\n Enter the number of Nodes: ");
    if (scanf("%d", &n) != 1 || n < 1 || n > 10)
    {
        /* fix: unchecked n > 10 overflowed cost[10][10] and dist[10] */
        printf("\n Number of nodes must be between 1 and 10\n");
        return 1;
    }
    printf("\n Enter the Weight Matrix:\n");
    printf("\nEnter 1000 to denote Infinity\n");
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            scanf("%d", &cost[i][j]);
        }
    }
    printf("\n Enter the Source Node:");
    if (scanf("%d", &v) != 1 || v < 1 || v > n)
    {
        /* fix: an out-of-range source indexed cost[][] out of bounds */
        printf("\n Source node must be between 1 and %d\n", n);
        return 1;
    }
    dijkstra(n, v - 1, cost, dist);  /* convert to 0-based index */
    printf("\n Shortest Path from Node %d: ", v);
    printf("\n#################################\n\n");
    for (i = 0; i < n; i++)
    {
        printf("Distance to Node:%d is %d\n", i + 1, dist[i]);
    }
    getchar();  /* fix: portable replacement for conio.h getch() */
    return 0;
}

/*
 * Single-source shortest paths (Dijkstra's algorithm).
 *   n    - number of nodes (indices 0..n-1, at most 10)
 *   v    - 0-based source node
 *   cost - adjacency matrix; 1000 denotes "no edge" (infinity)
 *   dist - out: dist[i] = length of the shortest path from v to i
 */
void dijkstra(int n, int v, int cost[10][10], int dist[10])
{
    enum { INF = 1000 };  /* same sentinel the callers use for infinity */
    int i, u, count, w, flag[10], min;

    for (i = 0; i < n; i++)
    {
        flag[i] = 0;
        dist[i] = cost[v][i];
    }
    count = 1;
    while (count < n)
    {
        min = INF;
        u = -1;  /* fix: u was read uninitialized (undefined behavior)
                    when every unvisited node was unreachable */
        for (w = 0; w < n; w++)
        {
            if (dist[w] < min && !flag[w])
            {
                min = dist[w];
                u = w;
            }
        }
        if (u == -1)  /* remaining nodes are unreachable: nothing to relax */
            break;
        flag[u] = 1;
        count++;
        /* relax every edge leaving the newly settled node u */
        for (w = 0; w < n; w++)
        {
            if ((dist[u] + cost[u][w] < dist[w]) && !flag[w])
            {
                dist[w] = dist[u] + cost[u][w];
            }
        }
    }
}
Experiment-4

Aim: Write a client-server application for chat using TCP.

Server program Code:


//Program for TCP chat.
#include<stdio.h>
#include<netinet/in.h>
#include<sys/socket.h>
#include<time.h>
#include<unistd.h>
#include<string.h>
/*
 * TCP chat server: accepts a single client and alternates
 * receive/send with it until this side types "bye".
 */
int main()
{
    int sd, sd2, nsd, clilen, sport, len;
    char sendmsg[20], recmsg[20];
    struct sockaddr_in cliadd, servaddr;

    printf("\nEnter the port address:");
    scanf("%d", &sport);
    sd = socket(AF_INET, SOCK_STREAM, 0);
    if (sd < 0)
        printf("Error:Socket creation failed\n");
    else
        printf("Socket is created successfully\n");
    memset(&servaddr, 0, sizeof(servaddr));  /* fix: clear sin_zero/padding */
    servaddr.sin_family = AF_INET;           /* fix: family was never set */
    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
    servaddr.sin_port = htons(sport);
    sd2 = bind(sd, (struct sockaddr *)&servaddr, sizeof(servaddr));
    if (sd2 < 0)
        printf("Error in binding\n");
    else
        printf("Binding successful\n");
    listen(sd, 5);
    clilen = sizeof(cliadd);
    nsd = accept(sd, (struct sockaddr *)NULL, NULL);
    if (nsd < 0)
        printf("Error: Cannot accept\n");
    else
        printf("Accept successful\n");
    do
    {
        recv(nsd, recmsg, 20, 0);
        printf("%s", recmsg);
        fgets(sendmsg, 20, stdin);
        len = strlen(sendmsg);
        sendmsg[len - 1] = '\0';  /* strip the newline kept by fgets */
        send(nsd, sendmsg, 20, 0);
    } while (strcmp(sendmsg, "bye") != 0);
    close(nsd);  /* fix: release both sockets before exiting */
    close(sd);
    return 0;
}
Client program Code:
//Program for TCP chat.
#include<stdio.h>
#include<sys/socket.h>
#include<netinet/in.h>
#include<string.h>
/*
 * TCP chat client: connects to the chat server on the local machine
 * and alternates send/receive with it until the server says "bye".
 */
int main()
{
    int csd, cport, len;
    char senmsg[20], recmsg[20];
    struct sockaddr_in servaddr;

    printf("\nEnter the port address:");  /* fix: prompt said "addrerss" */
    scanf("%d", &cport);
    csd = socket(AF_INET, SOCK_STREAM, 0);
    if (csd < 0)
        printf("Error: Socket creation failed\n");
    else
        printf("Socket is created successfully\n");
    memset(&servaddr, 0, sizeof(servaddr));  /* fix: clear sin_zero/padding */
    servaddr.sin_family = AF_INET;
    /* fix: the original connected to INADDR_ANY (0.0.0.0), which only
     * happens to reach the local host on some systems; name the
     * loopback address explicitly. */
    servaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    servaddr.sin_port = htons(cport);
    if (connect(csd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0)
        printf("Cannot connect\n");
    else
        printf("Connected\n");
    do
    {
        fgets(senmsg, 20, stdin);
        len = strlen(senmsg);
        senmsg[len - 1] = '\0';  /* strip the newline kept by fgets */
        send(csd, senmsg, len, 0);
        recv(csd, recmsg, 20, 0);
        printf("\n%s", recmsg);
    } while (strcmp(recmsg, "bye") != 0);
    return 0;
}
Experiment-5

Aim: Implementation of echo client server using TCP/IP.

Server program Code:


//Program to implement echo server.
#include<sys/socket.h>
#include<sys/types.h>
#include<arpa/inet.h>
#include<unistd.h>
#include<stdlib.h>
#include<string.h>
#include<stdio.h>
#define MAX_LINE 1000
/*
 * TCP echo server: listens on the user-supplied port, and for each
 * client echoes back the first message it receives.
 */
int main(int argc, char *argv[])
{
    int list_s;
    int conn_s;
    int port;
    int n;  /* bytes actually received from the client */
    struct sockaddr_in servaddr;
    char buffer[MAX_LINE];

    printf("\nEnter server port to listen:");
    scanf("%d", &port);
    if ((list_s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    {
        fprintf(stderr, "\nECHOSERV: Error creating listening socket\n");
        exit(EXIT_FAILURE);
    }
    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
    servaddr.sin_port = htons(port);
    if (bind(list_s, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0)
    {
        fprintf(stderr, "\nECHOSERV: Error calling bind()\n");
        exit(EXIT_FAILURE);
    }
    if (listen(list_s, 5) < 0)
    {
        fprintf(stderr, "\nECHOSERV: Error calling listen()\n");
        exit(EXIT_FAILURE);
    }
    while (1)
    {
        if ((conn_s = accept(list_s, NULL, NULL)) < 0)
        {
            fprintf(stderr, "\nECHOSERV: Error in calling accept()\n");
            exit(EXIT_FAILURE);
        }
        /* fix: check how many bytes arrived and NUL-terminate; the old
         * code printed whatever happened to be in the buffer. */
        n = read(conn_s, buffer, sizeof(buffer) - 1);
        if (n < 0)
            n = 0;
        buffer[n] = '\0';
        printf("\nMessage recived and echoed:%s\n", buffer);
        write(conn_s, buffer, strlen(buffer));
        if (close(conn_s) < 0)
        {
            fprintf(stderr, "\nECHOSERV: Error in calling close()\n");
            exit(EXIT_FAILURE);
        }
        /* fix: the original exited after the first client, making the
         * while(1) loop unreachable; keep accepting connections. */
        printf("Connection closed\n");
    }
}

Client program Code:


//Program to implement echoserver.
#include<sys/types.h>
#include<arpa/inet.h>
#include<unistd.h>
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#define MAX_LINE 1000
/*
 * TCP echo client.  Usage: echoclnt <ip-address> <port>.
 * Sends one line typed by the user to the echo server and prints the
 * server's response.
 */
int main(int argc, char *argv[])
{
    int conn_s;
    int port;
    int n;  /* bytes actually received back from the server */
    struct sockaddr_in servaddr;
    char buffer[MAX_LINE];
    char *szAddress;
    char *szPort;
    char *endptr;

    /* fix: the original dereferenced argv[1]/argv[2] unchecked and
     * crashed when started without arguments. */
    if (argc < 3)
    {
        fprintf(stderr, "Usage: %s <ip-address> <port>\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    printf("\nRemote host address:%s\nRemote port no:%s", argv[1], argv[2]);
    szAddress = argv[1];
    szPort = argv[2];
    port = strtol(szPort, &endptr, 0);
    if (*endptr)
    {
        printf("\nECHOCLNT: Invalid port supplied\n");
        exit(EXIT_FAILURE);
    }
    if ((conn_s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    {
        printf("\nECHOCLNT: Error in creating socket\n");
        exit(EXIT_FAILURE);
    }
    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(port);
    if (inet_aton(szAddress, &servaddr.sin_addr) == 0)
    {
        printf("\nECHOCLNT: Invalid remote IP address");
        exit(EXIT_FAILURE);
    }
    if (connect(conn_s, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0)
    {
        printf("\nECHOCLNT: Error in calling connect()\n");
        exit(EXIT_FAILURE);
    }
    printf("\nEnter the string to echo:");
    fgets(buffer, MAX_LINE, stdin);
    write(conn_s, buffer, strlen(buffer) + 1);  /* include the terminating NUL */
    /* fix: NUL-terminate using the byte count actually received instead
     * of assuming the buffer came back as a valid string. */
    n = read(conn_s, buffer, sizeof(buffer) - 1);
    if (n < 0)
        n = 0;
    buffer[n] = '\0';
    printf("\nEcho response:%s\n", buffer);
    return EXIT_SUCCESS;
}
Experiment-6

Aim: Case study of Architecture of Star 100 & T1-ASC.

STAR-100

The STAR-100 was a vector supercomputer designed, manufactured, and marketed by Control
Data Corporation (CDC). It was one of the first machines to use a vector processor to improve
performance on appropriate scientific applications. The name STAR was a construct of the
words STrings and ARrays. The 100 came from 100 million floating point operations per second
(MFLOPS), the speed at which the machine was designed to operate. The computer was
announced very early during the 1970s and was supposed to be several times faster than the CDC
7600, which was then the world's fastest supercomputer with a peak performance of
36 MFLOPS. On August 17, 1971, CDC announced that General Motors had placed the first
commercial order for a STAR-100.

Architecture
In general organization, the STAR was similar to CDC's earlier supercomputers, where a simple
RISC-like CPU was supported by a number of peripheral processors that offloaded housekeeping
tasks and allowed the CPU to crunch numbers as quickly as possible. In the STAR, both the CPU
and peripheral processors were deliberately simplified, however, to lower the cost and
complexity of implementation. The STAR also differed from the earlier designs by being based
on a 64-bit architecture instead of 60-bit, a side effect of the increasing use of 8-bit ASCII
processing. Also unlike previous machines, the STAR made heavy use of microcode and also
supported a virtual memory capability.

The main innovation in the STAR was the inclusion of instructions for vector processing. These
new and more complex instructions approximated what was available to users of the APL
programming language and operated on huge vectors that were stored in consecutive locations in
the main memory. The CPU was designed to use these instructions to set up additional hardware
that fed in data from the main memory as quickly as possible. For instance, a program could use
single instruction with a few parameters to add all the elements in two vectors that could be as
long as 65,535 elements. The CPU only had to decode a single instruction, set up the memory
hardware, and start feeding the data into the math units. As with instruction pipelines in general,
the performance of any one instruction was no better than it was before, but since the CPU was
effectively working on a number of instructions at once (or in this case, data points) the overall
performance dramatically improves due to the assembly line nature of the task.

The STAR-100 uses I/O processors to offload I/O from the CPU. Each I/O processor is a 16-bit
minicomputer with its own main memory of 65,536 words of 16 bits each, which is implemented
with core memory. The I/O processors all share a 128-bit data bus to the SAC.

TI Advanced Scientific Computer

The Advanced Scientific Computer, or ASC, was a supercomputer architecture designed by


Texas Instruments (TI) between 1966 and 1973. Key to the ASC's design was a single high-
speed shared memory, which was accessed by a number of processors and channel controllers, in
a fashion similar to Seymour Cray's groundbreaking CDC 6600. Whereas the 6600 featured ten
smaller computers feeding a single math unit (ALU), in the ASC this was simplified into a single
8-core processor feeding the ALU. The 4-core ALU/CPU was one of the first to include
dedicated vector processing instructions, with the ability to send the same instruction to all four
cores.
Architecture
Memory was accessed solely under the control of the memory control unit, or MCU. The MCU
was a two-way, 256-bit/channel parallel network that could support up to eight independent
processors, with a ninth channel for accessing "main memory" (or "extended memory" as they
referred to it). The MCU also acted as a cache controller, offering high speed access on the eight
processor ports to a semiconductor-based memory, and handling all communications to the 24-
bit address space in main memory. The MCU was designed to operate asynchronously, allowing
it to work at a variety of speeds and scale across a number of performance points. For instance,
main memory could be constructed out of slower but less expensive core memory, although this
was not used in practice. At the fastest, it could sustain transfer rates of 80 million 32-bit words
per second per port, for a total transfer capacity of 640M-words/sec. This was well beyond the
capabilities of even the fastest memories of the era.

The main ALU/CPU was extremely advanced for its era. The design included four basic cores
that could be combined to handle vector instructions. Each core included a complete instruction
pipeline system that could keep up to twelve scalar instructions in-flight at the same time,
allowing up to 36 instructions in total across the entire CPU. From one to four vector results
could be produced every 60ns, the basic cycle time (about 16 MHz), depending on the number of
execution units provided. Implementations of this sort of parallel/pipelined instruction system
did not appear on modern commodity processors until the late 1990s, and vector instructions
(now known as SIMD) until a few years later. The processor included 48 32-bit registers, a huge
number for the time, although they were not general purpose as they are in modern designs.
Sixteen were used for addresses, another sixteen for math, eight for index offsets and another
eight for vector instructions. Registers were accessed externally using a RISC-like load/store
system, with instructions to load anything from 4-bits to 64-bit (two registers) at a time.

Most vector machines tended to be memory-limited, that is, they could process data faster than
they could get it from memory. This remains a major problem on modern SIMD designs as well,
which is why considerable effort has been put into increasing memory throughput in modern
computer designs (although largely unsuccessfully). In the ASC this was improved somewhat
with a look ahead unit that predicted upcoming memory accesses and loaded them into the ALU
registers invisibly, using a memory interface in the CPU known as the memory buffer unit
(MBU).

The "Peripheral Processor" was a separate system dedicated entirely to quickly running the
operating system and programs running within it, as well as feeding data to the main CPU. The
PP was built out of eight "virtual processors", VP's, which were designed to handle instructions
and basic integer math only. Each VP included its own program counter and registers, and the
system could thus run eight programs at the same time, limited by memory accesses. Keeping
eight programs running allowed the system to shuffle execution of programs on the main CPU
depending on what data was available on the memory bus at that time, attempting to avoid "dead
time" when the CPU was waiting on memory. This technique has also made its appearance in
modern CPU's, where it is known as simultaneous multithreading or, according to Intel, Hyper
Threading.
Experiment-7

Aim: Case study of Architecture of Cyber-205

The Cyber 205 system had its origins in the STAR-100 computer. The STAR-100 resulted from
a line of development at CDC separate from that which led to the Cray-1.
This started in 1965 in response to a requirement of the Lawrence Livermore Laboratory for a
vector processor capable of executing 100 MFLOPS. A great deal of controversy raged about
this machine in its early years, and many of the essential design issues and performance goals
have been obscured. Despite the many difficulties which arose in the course of the STAR-100
programme, CDC remained convinced that the underlying architectural concepts of the STAR-
100 were sound, and went on to produce a second version, the STAR-100A, which appeared
commercially as the CYBER 203, and a further, completely re-engineered version, the STAR-
100C, which was produced commercially as the CYBER 205. In 1983 CDC formed a spin-off
company, ETA Systems Inc., with the goal of producing a multiprocessor system (the ETA 10),
based on the CYBER 205 architecture and having a performance capability of 10 GigaFLOPS. A
small number of these systems were sold commercially before the company closed down.

The STAR-100 was criticized on a number of grounds by users who wished to apply it to more
general computing problems than those for which it was designed. The grounds for criticism
were mainly the long vector start-up time and poor performance on scalar arithmetic, both of
which were inevitable consequences of the design. These problems are largely overcome in the
CYBER 205 by the use of a very much faster (80 ns access time) semiconductor memory and by
the inclusion of a high performance scalar unit. The overall performance of the CYBER 205 was
further enhanced by its implementation in specially developed ECL LSI Uncommitted Logic
Array technology, allowing a reduction of the clock period from the 40 ns used in the STAR-100
to 20 ns.

Architecture

The architecture of the CYBER 205 is similar to that of the STAR 100.Significant
architectural improvement include the addition of a scalar processor and related hardware, as
well as the availability of up to four floating-point pipelines. The major changes are:

1. A scalar processor executes instruction sequences which are not appropriate for vector
mode. The scalar processor contains independent functional units.

2. An instruction issue unit coordinates instruction processing between the scalar
processor and the vector processor. It decodes instructions for both the scalar processor
and the vector processor, and it can issue a scalar instruction every minor cycle unless
there is a conflict.

3. A load/store unit controls the transfer of data between the register file and storage. It
also acts as a buffer holding data when storage cycle conflicts occur. The load/store
unit is similar to the floating-point buffer and store data buffer of the IBM 360/91.
Figure: CYBER 205 system organization — the central memory connects to the
vector pipelines, the scalar unit, and the I/O ports that lead to the
I/O equipment.

Features of CYBER 205


 Basic pipeline clock period is 20 ns.
 Each memory cycle takes 80ns.
 Only 26 LSI chip types are used which increases the system reliability and
maintainability.
 Bipolar main memory which accesses 4 million 64 bit words within 80ns clock cycle time.
 Memory access patterns includes 512 bit super words (8 64 bit word) for vector operands
and full words and half words (32 bit) for scalar operands
Experiment-8

Aim: Case study of Pentium & power PC addressing modes.

The Pentium series is an excellent example of Complex Instruction Set Computer (CISC) design.
The PowerPC is a direct descendant of IBM 801, one of the best designed RISC systems on the
market.

Pentium
Intel has ranked the number one maker of microprocessors for decades. Here is a brief history of
the evolution of microprocessors that Intel has been manufacturing.

PowerPC
In 1975, IBM started the 801 minicomputer project that launched the RISC movement. In 1986,
IBM developed a RISC workstation, the RT PC, which was not a commercial success. In 1990, IBM
introduced the RISC System/6000 and marketed it as a high-performance workstation. IBM began to
refer to this as the POWER architecture.

IBM then entered into an alliance with Motorola the developer of the 68000 series for Apple
computers. The result of this alliance was the series of microprocessors that implement the
PowerPC architecture. The processors in the series were: 601, 603, 604, 620, 740/750 (G3), G4,
and G5. A complete description of the PowerPC ISA can be obtained from the IBM site.

Addressing Modes
Pentium

 Immediate: Operand = A
 Register operand: LA = R
 Displacement: LA = (SR) + A
 Base: LA = (SR) + (B)
 Base with displacement: LA = (SR) + (B) + A
 Scaled index with displacement: LA = (SR) + (I) x S + A
 Base with index and displacement: LA = (SR) + (B) + (I) + A
 Base with scaled index and displacement: LA = (SR) + (I) x S + (B) + A
 Relative: LA = (PC) + a

where
LA = linear address
(X) = contents of X
SR = segment register
PC = program counter
A = contents of an address field in the instruction
R = register
B = base register
I = index register
S = scaling factor
PowerPC
Load/Store Addressing

 Indirect: EA = (BR) + D
 Indirect indexed: EA = (BR) + (IR)

Branch Addressing

 Absolute EA = I
 Relative: EA = (PC) + I
 Indirect: EA = (L / CR)

Fixed-point Computation

 Register: EA = GPR
 Immediate: Operand = I

Floating Point Computation

 Register: EA = FPR

where
EA = effective address
(X) = contents of X
BR = base register
IR = index register
L / CR = link or count register
GPR = general purpose register
FPR = floating point register
D = displacement
I = immediate value
PC = program counter

Data Types :
Pentium

 General - byte (8), word (16), doubleword (32), and quadword (64). signed integers are in
2's complement representation. Pentium uses little endian style representation.
 Floating point - single precision (32), double precision (64), extended double precision
(80)
 BCD - unpacked (1 byte per digit) and packed (1 byte per 2 digits) representation

PowerPC

 General - byte (8), halfword (16), word (32), and double word (64). PowerPC can operate
in little endian or big endian mode.
 Floating point - single precision (32), double precision (64)
 Byte string - 0 to 128 bytes in length

Registers:
Pentium

 General - Eight 32 bit general purpose registers - EAX, EBX, ECX, EDX, ESP, EBP,
ECI, and EDI. The low 16 bits of each of these registers act as 16 bit registers - AX, BX,
CX, DX, SP, BP, CI, and DI. The lower and higher 8 bits of each of these 16 bit registers
are also identified as registers - AL, BL, CL, DL, AH, BH, CH, and DH.
 Floating Point - Eight registers of 64 bit floating point numbers FP0 to FP7.
 Multimedia - Eight 64 bit multimedia registers MM0 to MM7.
 Segment - Six 16 bit segment selectors that index into segment tables - CS, SS, DS, ES,
FS, and GS. CS register references the segment containing the instruction being executed.
SS register references the segment containing the user-visible stack. The remaining
segment registers enable the user to reference upto four separate data segments at a time.
 Flags register contains condition codes and various mode bits.
 Instruction Pointer (IP) - address of the current instruction

PowerPC

 General: Thirty two 64 bit general purpose registers R0 to R31.


 Exception Register (XER): Reports exceptions in integer arithmetic operations.
 General: Thirty two 64 bit general purpose registers for all floating point operations
FPR0 to FPR31.
 Floating point status and control register (FPSCR): 32-bit register that control the
operation of floating point quantities.
 Condition register: Consists of eight 4-bit condition code fields.
 Link register: Used in conditional branch instruction and for call / return.
 Count: Used to control an iteration loop.
Experiment-9

Aim: Case study of RISC pipelining.

In the history of computer hardware, some early reduced instruction set computer central
processing units (RISC CPUs) used a very similar architectural solution, now called a classic
RISC pipeline. Those CPUs were: MIPS, SPARC, Motorola 88000, and later the notional CPU
DLX invented for education.

Each of these classic scalar RISC designs fetched and attempted to execute one instruction per
cycle. The main common concept of each design was a five-stage execution instruction pipeline.
During operation, each pipeline stage would work on one instruction at a time. Each of these
stages consisted of an initial set of flip-flops and combinational logic which operated on the
outputs of those flip-flops.

The classic five stage RISC pipeline

Basic five-stage pipeline in a RISC machine (IF = Instruction Fetch, ID = Instruction Decode, EX =
Execute, MEM = Memory access, WB = Register write back). The vertical axis is successive instructions;
the horizontal axis is time. So in the green column, the earliest instruction is in WB stage, and the latest
instruction is undergoing instruction fetch.

Instruction fetch

The Instruction Cache on these machines had a latency of one cycle, meaning that if the
instruction was in the cache, it would be ready on the next clock cycle. During the Instruction
Fetch stage, a 32-bit instruction was fetched from the cache.

The Program Counter, or PC, is a register responsible for holding the address of the current
instruction. It feeds into the PC predictor which then sends the Program Counter (PC) to the
Instruction Cache to read the current instruction. At the same time, the PC predictor predicts the
address of the next instruction by incrementing the PC by 4 (all instructions were 4 bytes long).
This prediction was always wrong in the case of a taken branch, jump, or exception (see delayed
branches, below). Later machines would use more complicated and accurate algorithms (branch
prediction and branch target prediction) to guess the next instruction address.

Decode

Unlike earlier microcode machines, the first RISC machines had no microcode. Once fetched
from the instruction cache, the instruction bits were shifted down the pipeline, so that simple
combinational logic in each pipeline stage could produce the control signals for the datapath
directly from the instruction bits. As a result, very little decoding is done in the stage
traditionally called the decode stage. A consequence of this lack of decoding, however, was that
more instruction bits had to be used to specify what the instruction should do (and also, what it
should not), and that leaves fewer bits for things like register indexes.

All MIPS, SPARC, and DLX instructions have at most two register inputs. During the decode
stage, these two register names are identified within the instruction, and the two registers named
are read from the register file. In the MIPS design, the register file had 32 entries.

At the same time the register file was read, instruction issue logic in this stage determined if the
pipeline was ready to execute the instruction in this stage. If not, the issue logic would cause
both the Instruction Fetch stage and the Decode stage to stall. On a stall cycle, the stages would
prevent their initial flip-flops from accepting new bits.

If the instruction decoded was a branch or jump, the target address of the branch or jump was
computed in parallel with reading the register file. The branch condition is computed after the
register file is read, and if the branch is taken or if the instruction is a jump, the PC predictor in
the first stage is assigned the branch target, rather than the incremented PC that has been
computed. It should be noted that some architectures instead made use of the ALU in the Execute
stage, at the cost of a slightly decreased instruction throughput.

The decode stage ended up with quite a lot of hardware: the MIPS instruction set had the
possibility of branching if two registers were equal, so a 32-bit-wide AND tree ran in series after
the register file read, making a very long critical path through this stage. Also, the branch target
computation generally required a 16-bit adder and a 14-bit incrementer. Resolving the branch in the
decode stage made it possible to have just a single-cycle branch mispredict penalty. Since
branches were very often taken (and thus mispredicted), it was very important to keep this
penalty low.

Execute

The Execute stage is where the actual computation occurs. Typically this stage consists of an
Arithmetic and Logic Unit, and also a bit shifter. It may also include a multiple cycle multiplier
and divider.

The Arithmetic and Logic Unit is responsible for performing boolean operations (and, or, not,
nand, nor, xor, xnor) and also for performing integer addition and subtraction. Besides the result,
the ALU typically provides status bits such as whether or not the result was 0, or if an overflow
occurred.

The bit shifter is responsible for shift and rotations.

Instructions on these simple RISC machines can be divided into three latency classes according
to the type of the operation:

 Register-Register Operation (Single-cycle latency): Add, subtract, compare, and logical
operations. During the execute stage, the two arguments were fed to a simple ALU,
which generated the result by the end of the execute stage.
 Memory Reference (Two-cycle latency). All loads from memory. During the execute
stage, the ALU added the two arguments (a register and a constant offset) to produce a
virtual address by the end of the cycle.
 Multi-cycle Instructions (Many cycle latency). Integer multiply and divide and all
floating-point operations. During the execute stage, the operands to these operations were
fed to the multi-cycle multiply/divide unit. The rest of the pipeline was free to continue
execution while the multiply/divide unit did its work. To avoid complicating the write
back stage and issue logic, multicycle instruction wrote their results to a separate set of
registers.

Memory access

If data memory needs to be accessed, it is done so in this stage.

During this stage, single cycle latency instructions simply have their results forwarded to the
next stage. This forwarding ensures that both single and two cycle instructions always write their
results in the same stage of the pipeline, so that just one write port to the register file can be used,
and it is always available.

For direct mapped and virtually tagged data caching, the simplest by far of the numerous data
cache organizations, two SRAMs are used, one storing data and the other storing tags.

Write back

During this stage, both single cycle and two cycle instructions write their results into the register
file.
Experiment-10

Aim: Case study of Pentium 4.

Pentium 4 is a line of single-core desktop, laptop and entry level server central processing units
(CPUs) introduced by Intel on November 20, 2000 and shipped through August 8, 2008. They
had a seventh-generation x86 microarchitecture, called NetBurst, which was the company's first
all-new design since the introduction of the P6 microarchitecture of the Pentium Pro CPUs in
1995. NetBurst differed from P6 (Pentium III, II, etc.) by featuring a very deep instruction
pipeline to achieve very high clock speeds. Intel claimed that NetBurst would allow clock speeds
of up to 10 GHz; however, severe problems with heat dissipation (especially with the Prescott
Pentium 4) limited CPU clock speeds to a much lower 3.8 GHz.

In 2004, the initial 32-bit x86 instruction set of the Pentium 4 microprocessors was extended by
the 64-bit x86-64 set.

The first Pentium 4 cores, codenamed Willamette, were clocked from 1.3 GHz to 2 GHz. They
were released on November 20, 2000, using the Socket 423 system. Notable with the
introduction of the Pentium 4 was the 400 MT/s FSB. It actually operated at 100 MHz but the
FSB was quad-pumped, meaning that the maximum transfer rate was four times the base clock of
the bus, so it was marketed to run at 400 MHz. The AMD Athlon's double-pumped FSB was
running at 100 or 133 MHz (200 or 266 MT/s) at that time.

Pentium 4 CPUs introduced the SSE2 and, in the Prescott-based Pentium 4s, SSE3 instruction
sets to accelerate calculations, transactions, media processing, 3D graphics, and games. Later
versions featured Hyper-Threading Technology (HTT), a feature to make one physical CPU
work as two logical CPUs. Intel also marketed a version of their low-end Celeron processors
based on the NetBurst microarchitecture (often referred to as Celeron 4), and a high-end
derivative, Xeon, intended for multiprocessor servers and workstations. In 2005, the Pentium 4
was complemented by the dual-core Pentium D and Pentium Extreme Edition.

Microarchitecture

In benchmark evaluations, the advantages of the NetBurst microarchitecture were unclear. With
carefully optimized application code, the first Pentium 4s outperformed Intel's fastest Pentium III
(clocked at 1.13 GHz at the time), as expected. But in legacy applications with many branching
or x87 floating-point instructions, the Pentium 4 would merely match or run more slowly than its
predecessor. Its main handicap was a shared unidirectional bus. Furthermore, the NetBurst
microarchitecture consumed more power and emitted more heat than any previous Intel or AMD
microarchitectures.

As a result, the Pentium 4's introduction was met with mixed reviews: Developers disliked the
Pentium 4, as it posed a new set of code optimization rules. For example, in mathematical
applications, AMD's lower-clocked Athlon (the fastest-clocked model was clocked at 1.2 GHz at
the time) easily outperformed the Pentium 4, which would only catch up if software was re-
compiled with SSE2 support. Tom Yager of Infoworld magazine called it "the fastest CPU - for
programs that fit entirely in cache". Computer-savvy buyers avoided Pentium 4 PCs due to their
price premium, questionable benefit, and initial restriction to Rambus RAM. In terms of product
marketing, the Pentium 4's singular emphasis on clock frequency (above all else) made it a
marketer's dream. The result of this was that the NetBurst microarchitecture was often referred to
as a "marchitecture" (a marketing-driven architecture) by various computing websites and
publications during the life of the Pentium 4. It was also called "NetBust," a term popular with
reviewers who reflected negatively upon the processor's performance.
processor's performance.

The two classical metrics of CPU performance are IPC (instructions per cycle) and clock speed.
While IPC is difficult to quantify due to dependence on the benchmark application's instruction
mix, clock speed is a simple measurement yielding a single absolute number. Unsophisticated
buyers would simply consider the processor with the highest clock speed to be the best product,
and the Pentium 4 had the fastest clock speed. Because AMD's processors had slower clock
speeds, it countered Intel's marketing advantage with the "megahertz myth" campaign. AMD
product marketing used a "PR-rating" system, which assigned a merit value based on relative
performance to a baseline machine.

Pentium 4 HT 3.00 GHz Prescott

At the launch of the Pentium 4, Intel stated that NetBurst-based processors were expected to
scale to 10 GHz after several fabrication process generations. However, the clock speed of
processors using the NetBurst microarchitecture reached a maximum of 3.8 GHz. Intel had not
anticipated a rapid upward scaling of transistor power leakage that began to occur as the die
reached the 90 nm lithography and smaller. This new power leakage phenomenon, along with the
standard thermal output, created cooling and clock scaling problems as clock speeds increased.
Reacting to these unexpected obstacles, Intel attempted several core redesigns ("Prescott" most
notably) and explored new manufacturing technologies, such as using multiple cores, increasing
FSB speeds, increasing the cache size, and using a longer instruction pipeline along with higher
clock speeds. These solutions failed, and from 2003 to 2005, Intel shifted development away
from NetBurst to focus on the cooler-running Pentium M microarchitecture. On January 5, 2006,
Intel launched the Core processors, which put greater emphasis on energy efficiency and
performance per clock cycle. The final NetBurst-derived products were released in 2007, with all
subsequent product families switching exclusively to the Core microarchitecture.

You might also like