Hiya Folks,

Is using regular expressions in asm possible?
if it is, then is it efficient and where can i find a library (if there is one) or how can i make one?

Regards,
Art
Posted on 2003-10-23 21:22:28 by art_sands

Hiya Folks,

Is using regular expressions in asm possible?
if it is, then is it efficient and where can i find a library (if there is one) or how can i make one?

Regards,
Art


Of course it is! After all, if you can do it in any other language you can certainly do it in assembly language :-). Now if you are asking "is there a library routine to which I can pass a string containing a regular expression and that routine will match some other string..." then the answer is "probably, but I don't know where." Certainly such things exist; I just don't have a direct source.

What I can turn you on to, however, is the "UCR Standard Library for 80x86 Assembly Language Programmers" (MASM/DOS/16-bits) and the HLA Standard Library (32-bit/Win32/Linux). Both contain pattern matching libraries that support regular *and* context free languages. The HLA Standard Library is quite a bit more powerful (and more efficient) than the UCR stdlib version, but it does use HLA (and, unfortunately, the pattern matching library is one of the few parts of the HLA stdlib that didn't translate to MASM well, because of the limited macro capabilities of MASM).

I have attached a sample HLA program where I converted the "regular expression parser" example from "The Great Computer Language Shootout" into HLA (note that I've included the C example in a commented-out section of the program, look below the C code to see the equivalent HLA code). This is an interesting example as it clearly demonstrates that assembly language requires *fewer* lines of code than the comparable C code (and that doesn't even consider the fact that the C code in this example calls a non-standard library routine that wasn't included in the C source module!).

You can find HLA and the HLA Standard Library, as well as the UCR Standard Library, at http://webster.cs.ucr.edu.
Cheers,
Randy Hyde





// regexpGCLS
//
// This program demonstrates processing of regular expressions in
// assembly language. This is based on the "regexp.gcc"
// program that is part of "The Great Computer
// Language Shoot-out" found at
//
// http://www.bagley.org/~doug/shootout/
//
// The purpose of that web page is to demonstrate several
// applications written in various languages. Although one
// of the primary purposes of that web site is to demonstrate
// the different run-time efficiencies of various languages,
// this HLA implementation was not created to demonstrate
// that assembly code is faster or smaller (everyone pretty
// much accepts the fact that the fastest and/or smallest
// example of a program will be written in assembly language).
// Instead, this example demonstrates that with the use of
// a high level assembler (e.g., HLA), it's also possible to
// write code almost as easily as in a high level language
// like C. As such, this example freely sacrifices efficiency
// for readability.

#if( false )

/* -*- mode: c -*-
* $Id: regexmatch.gcc,v 1.4 2000/12/24 05:43:53 doug Exp $
* http://www.bagley.org/~doug/shootout/
*/

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pcre.h>

#define MAXLINES 100
#define MAXLINELEN 132

/*
 * PCRE pattern for a US-style phone number, assembled from adjacent
 * string literals so that each fragment can carry its own comment.
 * Capture groups 2, 3 and 4 hold the area code, the prefix and the
 * last four digits respectively.
 */
char *pattern =
"(?:^|[^\\d\\(])" /* must be preceded by start of line or a char that is neither a digit nor '(' */
"(\\()?" /* match 1: possible initial left paren */
"(\\d\\d\\d)" /* match 2: area code is 3 digits */
"(?(1)\\))" /* if match1 then match right paren */
"[ ]" /* area code followed by one space */
"(\\d\\d\\d)" /* match 3: prefix of 3 digits */
"[ -]" /* separator is either space or dash */
"(\\d\\d\\d\\d)" /* match 4: last 4 digits */
"\\D" /* must be followed by a non-digit */
;


/*
 * Reads up to MAXLINES lines from stdin, then runs the phone-number
 * pattern over every stored line NUM times (NUM = argv[1] when exactly
 * one argument is given, otherwise 1).  On the final pass each match is
 * printed as "count: (aaa) ppp-ssss".
 *
 * Returns 0 on success; exits with status 1 on allocation failure, on
 * too many input lines, or when the pattern cannot be compiled.
 */
int
main(int argc, char *argv[]) {
    int NUM = ((argc == 2) ? atoi(argv[1]) : 1);
    int count;
    char *cptr = "";
    char **phones;
    pcre *re;
    int erroffset;
    const char *errptr;
    int n, lines = 0;
    char num[256];
    int i, j, k, matchlen;
    char *matchoffset;
    int nmatches;
    int *ovec, ovecsize;
    pcre_extra *study;

    phones = (char **)malloc(MAXLINES * sizeof(char *));
    if (!phones) {
        fprintf(stderr, "malloc for phones array failed\n");
        exit(1);
    }

    lines = 0;
    while (cptr) {
        char *line = (char *)malloc(MAXLINELEN);
        if (!line) {
            fprintf(stderr, "malloc to hold line #%d failed\n", lines);
            exit(1);
        }
        cptr = fgets(line, MAXLINELEN, stdin);
        if (!cptr) {
            /* EOF or read error: the buffer holds no line, so it must
               not be counted or scanned later. */
            free(line);
            break;
        }
        /* Check the bound BEFORE storing, so phones[MAXLINES] is never
           written. */
        if (lines >= MAXLINES) {
            fprintf(stderr, "MAXLINES is too small\n");
            exit(1);
        }
        phones[lines++] = line;
    }

    re = pcre_compile(pattern, 0, &errptr, &erroffset, NULL);
    if (!re) {
        fprintf(stderr, "can't open compile regexp\n");
        exit(1);
    }

    study = pcre_study(re, 0, &errptr);

    if (pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &nmatches) != 0) {
        fprintf(stderr, "pcre_fullinfo failed\n");
        exit(1);
    }
    nmatches++; /* add match of entire pattern */

    /* pcre_exec takes the ovector size as a number of ints, not bytes;
       three ints are needed per (sub)match. */
    ovecsize = nmatches * 3;
    ovec = (int *)malloc(ovecsize * sizeof(int));
    if (!ovec) {
        fprintf(stderr, "malloc for ovec array failed\n");
        exit(1);
    }

    count = 0;
    while (NUM--) {
        for (i = 0; i < lines; i++) {
            n = pcre_exec(re, study,
                          phones[i], strlen(phones[i]), 0,
                          0, ovec, ovecsize);
            if (n == nmatches) {
                /* stuff the match into the buffer "num" */
                k = 2*2; /* initial offset into ovec: start of group 2 */
                /* areacode */
                j = 0;
                num[j++] = '(';
                matchoffset = phones[i] + ovec[k];
                matchlen = ovec[k+1] - ovec[k];
                strncpy(num+j, matchoffset, matchlen);
                j += matchlen; k += 2;
                num[j++] = ')';
                /* space separator */
                num[j++] = ' ';
                /* exchange */
                matchoffset = phones[i] + ovec[k];
                matchlen = ovec[k+1] - ovec[k];
                strncpy(num+j, matchoffset, matchlen);
                j += matchlen; k += 2;
                /* dash */
                num[j++] = '-';
                /* last 4 digits */
                matchoffset = phones[i] + ovec[k];
                matchlen = ovec[k+1] - ovec[k];
                strncpy(num+j, matchoffset, matchlen);
                j += matchlen; k += 2;
                /* with a cherry on top */
                num[j] = 0;
                /* NUM was already decremented by the loop test, so it
                   is 0 only during the final pass. */
                if (0 == NUM) {
                    count++;
                    printf("%d: %s\n", count, num);
                }
            }
        }
    }

    /* Only the first `lines` slots were ever assigned. */
    for (i = 0; i < lines; i++) {
        free(phones[i]);
    }
    free(phones);
    free(ovec);

    return(0);
}
#endif


// regexp
//
// Usage: regexp <filename>
//
// Reads every line of <filename> into memory (at most MaxLines lines),
// then scans each line for a phone number using the HLA Standard
// Library pattern-matching (pat) routines. For each line that matches,
// prints
//
//      <line index>: (<area code>) <prefix>-<suffix>
//
// where <line index> is the zero-based index of the line in the file.

program regexp;
#include( "stdlib.hhf" )
const
    MaxLines := 100;                    // capacity of the lines[] array

static
    f        :dword;                    // handle returned by fileio.open
    i        :uns32;                    // index of the line being scanned
    lineCnt  :uns32;                    // number of lines actually read
    areaCode :str.strvar(16);
    prefix   :str.strvar(16);
    suffix   :str.strvar(16);
    lines    :string[ MaxLines ];       // one string pointer per input line

begin regexp;

    if( arg.c() != 2 ) then

        stdout.put( "Usage: regexp <filename>" nl );
        exit regexp;

    endif;

    // Read the whole file, one string per line. EBX counts the lines
    // stored so far and doubles as the index of the next free slot.

    mov( fileio.open( arg.v( 1 ), fileio.r ), f );
    mov( 0, ebx );
    while( !fileio.eof( f )) do

        // lines[] has only MaxLines slots; stop before writing past
        // the end of the array.

        if( ebx >= MaxLines ) then

            stdout.put( "MaxLines is too small" nl );
            fileio.close( f );
            exit regexp;

        endif;

        fileio.a_gets( f );             // EAX = string holding the line
        mov( eax, lines[ ebx*4 ] );
        inc( ebx );

    endwhile;
    mov( ebx, lineCnt );
    fileio.close( f );

    // Scan each stored line. The index lives in memory (i) and is
    // reloaded into EDX by the loop test on every iteration.

    for( mov( 0, i ); mov( i, edx ) < lineCnt; inc( i )) do

        pat.match( lines[ edx*4 ] );

            // Skip leading characters that are neither '(' nor a digit.

            pat.zeroOrMoreCset( -{ '(','0'..'9' } );

            // Optional '(', three-digit area code, optional ')'.

            pat.zeroOrOneChar( '(' );
            pat.exactlyNCset( {'0'..'9'}, 3 );
            pat.extract( areaCode );
            pat.zeroOrOneChar( ')' );

            // Optional white space, then the three-digit prefix.

            pat.zeroOrMoreWS();
            pat.exactlyNCset( {'0'..'9'}, 3 );
            pat.extract( prefix );

            // One or more dashes/spaces, then the four-digit suffix.

            pat.oneOrMoreCset( {'-', ' '} );
            pat.exactlyNCset( {'0'..'9'}, 4 );
            pat.extract( suffix );

            stdout.put( i:2,": (", areaCode, ") ", prefix, '-', suffix, nl );

          pat.if_failure;

            // Lines without a phone number are silently skipped.

        pat.endmatch;

    endfor;

end regexp;

Posted on 2003-10-23 23:34:20 by rhyde
Give a look at Perl Compatible Regular Expression library by Philip Hazel at http://www.pcre.org/

The library is free, even for commercial applications !!
Posted on 2003-10-24 02:03:50 by pelaillo
Thanks guys,

Dear Randy,

I came to know about it later while reading your book, so I was feeling a bit stupid. But anyway, seeing the above example, HLA looks to be one of the best assembly language flavors that I've ever come across. Also, it looks a lot easier to understand, so I guess I might be learning and using it after all. Great work. Thanks.

Dear pelaillo,

Thanks for showing me that library. gr8 work u too.

Regards,
Art
Posted on 2003-10-24 06:22:59 by art_sands