/*
* udf_flevtok.cc: 03'2005
* (schoenfr@gaertner.de)
*
* implements a user defined function for the mysql server.
*
* returned is a maximum similarity number.
* it is calculated with help of the minimum levenshtein distance of a word
* to the tokens of a string.
* the tokens are seperated by whitespaces.
* the maximum similarity ( 0.0 < x <= 1.0) is returned.
*
* this allows some fuzzy lookup of a word in a sentence.
*
*
* g++ -shared -I/usr/local/mysql/include/mysql -o udf_flevtok.so \
* udf_flevtok.cc
*
* CREATE FUNCTION flevtok RETURNS REAL SONAME "udf_flevtok.so";
* DROP FUNCTION flevtok;
*
* use: int = levtok ( string words, string token, bool ignore_case)
*
* to get the minimum levenshtein distance for any word in words
* checked against the token word.
*
* select flevtok("nase", "Nase", 1) -> 1.0
* select flevtok("nase", "Nase", 0) -> 0.75
* select flevtok("der hase mit der nase", "nase", 0) -> 1.0
* select flevtok("nas en bear", "nase", 0) -> 0.67
* select flevtok("hasen ohren", "nase", 0) -> 0.5
* select flevtok("nah", "", 0) -> 0
* select flevtok("a", NULL, 0) -> NULL
*
* see also: http://www.bertnase.de/html/stuff.html
*
* bugs: - if the token-word contains whitespaces the result is funny.
*/
char *udf_flevtok_version = "udf_flevtok version: v1.0d, 15.03.2005";
/* Copyright (C) 2005 schoenfr@gaertner.de
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef STANDALONE
# include <stdio.h>
# include <stdlib.h>
# include <malloc.h>
# include <ctype.h>
# include <string.h>
#else
# ifdef STANDARD
# include <stdio.h>
# include <string.h>
# ifdef __WIN__
typedef unsigned __int64 ulonglong; /* Microsofts 64 bit types */
typedef __int64 longlong;
# else
typedef unsigned long long ulonglong;
typedef long long longlong;
# endif /*__WIN__*/
# else
# include <my_global.h>
# include <my_sys.h>
# endif
# include <mysql.h>
# include <m_ctype.h>
# include <m_string.h> // To get strmov()
#endif /* ! STANDALONE */
static double lev_tok (char *line, char *tok);
/* this function is from: http://www.merriampark.com/ldc.htm */
static int levenshtein_distance(char *s,char*t);
#define min2(a,b) ((a) < (b) ? (a) : (b))
#define max2(a,b) ((a) > (b) ? (a) : (b))
static void
buf_tolower (char *s)
{
int i;
while (*s) {
*s = tolower (*s);
s++;
}
}
double
lev_tok (char *buf, char *tok)
{
int buf_len = strlen(buf);
int tok_len = strlen(tok);
double ret_val;
char *ptr = buf;
int min_len = min2(tok_len, buf_len);
/*
* some specials:
*/
if (buf_len == tok_len && ! strcmp (buf, tok)) {
return 1.0;
} else if (buf_len == 0 || tok_len == 0) {
return 0.0;
}
/* no similarity: */
ret_val = 0.0;
while (*ptr)
{
char *p, old_char;
int rc;
/*
* skip over white:
*/
while (*ptr && isspace(*ptr)) {
ptr++;
}
// if (! *ptr) {
// return ret_val;
// }
/*
* scan end of token:
*/
p = ptr;
while (*p && ! isspace(*p)) {
p++;
}
old_char = *p;
*p = 0;
rc = levenshtein_distance (ptr, tok);
if (! rc) {
/* exact match: */
return 1.0;
}
if (rc != -1) {
double f = 1.0 - (rc / (double) min2(strlen (ptr), tok_len));
if (f > ret_val) {
ret_val = f;
}
}
*p = old_char;
ptr = p;
}
return ret_val;
}
#ifdef HAVE_DLOPEN
extern "C" {
my_bool flevtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message);
void flevtok_deinit (UDF_INIT *initid);
double flevtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null,
char *error);
}
my_bool
flevtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message)
{
char **buf_ptr;
if (args->arg_count != 3
|| args->arg_type[0] != STRING_RESULT
|| args->arg_type[1] != STRING_RESULT
|| args->arg_type[2] != INT_RESULT)
{
strcpy (message,
"FLEVTOK() requires three arguments (str text, str word, bool case)");
return 1;
}
initid->max_length = 11; // max result length (?)
initid->maybe_null = 0;
if (! (buf_ptr = (char **) malloc (2 *sizeof (char *)))
|| ! (buf_ptr [0] = (char *) malloc (args->lengths [0] + 1))
|| ! (buf_ptr [1] = (char *) malloc (args->lengths [1] + 1)))
{
strmov (message, "flevtok: Couldn't allocate memory");
return 1;
}
initid->ptr = (char *) buf_ptr;
return 0;
}
/****************************************************************************
** Deinit function. This should free all resources allocated by
** this function.
** Arguments:
** initid Return value from xxxx_init
****************************************************************************/
void flevtok_deinit (UDF_INIT *initid)
{
if (initid->ptr) {
char **buf_ptr = (char **) initid->ptr;
free (buf_ptr [1]);
free (buf_ptr [0]);
free (initid->ptr);
}
}
/***************************************************************************
** UDF string function.
** Arguments:
** initid Structure filled by xxx_init
** args The same structure as to xxx_init. This structure
** contains values for all parameters.
** Note that the functions MUST check and convert all
** to the type it wants! Null values are represented by
** a NULL pointer
** result Possible buffer to save result. At least 255 byte long.
** length Pointer to length of the above buffer. In this the function
** should save the result length
** is_null If the result is null, one should store 1 here.
** error If something goes fatally wrong one should store 1 here.
**
** This function should return a pointer to the result string.
** Normally this is 'result' but may also be an alloced string.
***************************************************************************/
double
flevtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error)
{
char **buf_ptr, *arg1, *arg2;
longlong arg3;
double rc = 0.0;
*error = 0;
arg1 = args->args [0];
arg2 = args->args [1];
arg3 = * ((longlong *) args->args[2]);
if (! arg1 || ! arg2)
{
*is_null = 1;
return rc;
}
buf_ptr = (char **) initid->ptr;
/*
* make sure we have a trailing zero byte, so we ignore
* binary data at treat the arg as a zero terminated string.
*/
memcpy (buf_ptr [0], args->args [0], args->lengths [0]);
buf_ptr [0] [args->lengths [0]] = 0;
memcpy (buf_ptr [1], args->args [1], args->lengths [1]);
buf_ptr [1] [args->lengths [1]] = 0;
if (arg3 != 0) {
buf_tolower (buf_ptr [0]);
buf_tolower (buf_ptr [1]);
}
rc = lev_tok (buf_ptr [0], buf_ptr [1]);
return rc;
}
#endif /* ! HAVE_DLOPEN */
#ifdef STANDALONE
int
main (int argc, char *argv[])
{
double rc;
char *tok;
char line [1024];
if (argc != 2) {
return 0;
}
tok = argv [1];
while (fgets (line, sizeof (line), stdin)) {
if (line [strlen (line) - 1] == '\n') {
line [strlen (line) - 1] = 0;
}
rc = lev_tok (line, tok);
printf ("see: tok=.%s. line=.%s. -> %.3f\n", line, tok, rc);
}
return 0;
}
#endif
/*
* the remaining function is taken from:
* http://www.merriampark.com/ldc.htm
* Lorenzo Seidenari (sixmoney@virgilio.it)
*/
#define minimum(a,b,c) min2(min2((a),(b)),(c))
/****************************************/
/*Implementation of Levenshtein distance*/
/****************************************/
static int
levenshtein_distance(char *s,char*t)
/*Compute levenshtein distance between s and t*/
{
//Step 1
int k,i,j,n,m,cost,*d,distance;
n=strlen(s);
m=strlen(t);
if(n!=0&&m!=0)
{
d=(int *)malloc((sizeof(int))*(m+1)*(n+1));
m++;
n++;
//Step 2
for(k=0;k<n;k++)
d[k]=k;
for(k=0;k<m;k++)
d[k*n]=k;
//Step 3 and 4
for(i=1;i<n;i++)
for(j=1;j<m;j++)
{
//Step 5
if(s[i-1]==t[j-1])
cost=0;
else
cost=1;
//Step 6
d[j*n+i]=minimum(d[(j-1)*n+i]+1,d[j*n+i-1]+1,d[(j-1)*n+i-1]+cost);
}
distance=d[n*m-1];
free(d);
return distance;
}
else
return -1; //a negative return value means that one or both strings are empty.
}
/* Local Variables: */
/* mode:c */
/* compile-command:"g++ -O -shared -I/usr/local/mysql/include/mysql -o udf_flevtok.so udf_flevtok.cc" */
/* End: */
/* end of udf_flevtok.cc */