Anagram finder in ML (OCaml)

This is the ML implementation of the anagram program. The problem and the solution is described in the main anagram article. Do read that article first.

This code is basically ML with some of the features of Objective Caml.

There are two modules, LetterReader, that does what it says, and Word which is similar to the Word classes or modules in other anagram implementations. There is also one class, AnagramFinder. Finally there is the main program.

A few identifiers have a leading underscore followed by a capital letter. This circumvents the rule that prevents these identifiers from starting with an uppercase letter.

In ML it is possible, even encouraged, not to specify types for identifiers. The compiler does a very good job of deducing types. But I find code easier to read if the types are explicit, so the code below have many explicit type specification that could have been left out.

Commented source code

Shared

A shortcut for printf, and few global constants.

let  printf      = Printf.printf
let _TEXT_SIZE   = 500*512   (*--- large enough for 500 * poem              *)
let _ACC_MINIMUM =  3        (*--- minimum # of words in an anagram set     *)
let _ACC_SIZE    = 20        (*--- fixed size of buffer to hold anagram set *)

Module LetterReader

This module reads the input and returns an uppercased string of letters.

get_char() reads a character from stdin and returns a character or EOF. get_letter() repeatedly calls get_char() until it gets a letter or EOF. read_letters() repeatedly calls get_letter() and stores the letters in its parameter buf, until it gets and EOF. read(), the only public routine, creates the input buffer, of type Bytes, calls read_letters(), and then returns the Bytes converted to an uppercased string.

Both get_letter() and read_letters() use tail recursion.

module LetterReader =
  struct
    type _CharOrEof = CHAR of char | EOF

    let get_char () : _CharOrEof =
      try
        CHAR(input_char stdin)
      with End_of_file ->
        EOF

    let is_letter (ch : char) : bool =
      ((ch >= 'a') && (ch <= 'z')) ||
      ((ch >= 'A') && (ch <= 'Z'))

    let rec get_letter () : _CharOrEof =
      match get_char () with
        CHAR(ch) ->
          if (is_letter ch) then
            CHAR(ch)
          else
            get_letter ()
      | EOF -> EOF

    let rec read_letters (buf : bytes) (put : int) : int =
      match (get_letter ()) with
        CHAR(ch) -> begin
          Bytes.set    buf  put ch;
          read_letters buf (put+1)
        end
      | EOF -> put                         (*--- return number of letters read *)

    let read () : string =
      let buf = Bytes.make (_TEXT_SIZE) ' ' in
      let l   = read_letters buf 0  in
      String.uppercase_ascii (Bytes.sub_string buf 0 l)
  end;;

Module Word

This module defines a type word, creates new words (new_word()), compares two words (compare_words()), and creates a unified version of a word (get_unified())

The function get_unified() is rather ugly as it needs both a char Array and a Bytes as temporaries, but I can’t see a way around them.

module Word =
  struct
    type word = { value : string; unified : string; offset : int }

    let comparisons = ref 0  (*--- Printed at end of run *)

    let get_unified (value : string) : string =
      let len   = String.length  value in
      let chars = Array.make len ' '   in
      let rval  = Bytes.make len ' '   in
      for i = 0 to (len - 1) do
        chars.(i) <- value.[i]         (*--- string     -> char array *)
      done;
      Array.sort Stdlib.compare chars; (*--- sort char array          *)
      for i = 0 to (len - 1) do
        Bytes.set rval i chars.(i)     (*--- char array -> bytes      *)
      done;
      Bytes.to_string rval             (*--- bytes      -> string     *)

    let new_word (value : string) (offset : int) : word =
      { value; unified = (get_unified value); offset }

    let compare_words (a : word) (b : word) =
      comparisons := !comparisons + 1;
      let cw = compare a.unified b.unified in
      if (0 <> cw) then
        cw
      else
        compare a.offset b.offset
  end;;

Class AnagramFinder

Just to show off, I made this into a class, not a module. Note that the method add_range(), present in the model solutions, has been dropped. The new methods no_overlap() and exists() make the code a little cleaner. The latter is tail recursive.

One thing is less than obvious. The method find_all() calculates nwords, the number of substrings of length len in text. It then allocates an array with an extra element. All elements are initialized with a word that has both value and unified set to "xxx". All of them, except the last, will be replaced as the array is filled with substrings from text. The surviving xxx will stay in position after sorting, since xxx is greater than any uppercase string. But why? you ask. Because with that guard post in place, the last substring, the next to last element of words, does not need special treatment.

class _AnagramFinder (llen : int) =
  object (self)
    val mutable accepted = Array.make _ACC_SIZE (Word.new_word "a" 0)
    val mutable put        : int = 0
    val mutable len        : int = llen

    method print_one_result () : unit =
      printf "%3d: " put;
      for n = 0 to (put-1) do
        let w = accepted.(n) in
        printf " %4d: %s" w.offset w.value
      done;
      print_endline ""

    method no_overlap (cand : Word.word) =
      (cand.offset >= accepted.(put-1).offset+len)

    method exists (cand : Word.word) (offset : int) =
      (offset < put)
        && ((0 == (compare accepted.(offset).value cand.value))
          || self#exists cand (offset + 1))

    method add_candidate (cand : Word.word) : unit =
      if (put == 0) || ((self#no_overlap cand) && (not (self#exists cand 0))) then begin
        accepted.(put) <- cand;
        put            <- put + 1
      end

    method find_all (text : string) : unit =
      put <- 0;
      let nwords = String.length text - len + 1 in
      let words  = Array.make (nwords+1) (Word.new_word "xxx" 0) in
      for offset = 0 to (nwords - 1) do
        words.(offset) <- (Word.new_word (String.sub text offset len) offset)
      done;
      Array.sort Word.compare_words words;
      for n = 0 to (Array.length words) - 2 do
        self#add_candidate words.(n);
        if (0 <> (compare words.(n).unified words.(n+1).unified)) then begin
          if (put >= _ACC_MINIMUM) then self#print_one_result();
          put <- 0;
        end
      done
  end;;

The main program

let main (argc : int) (argv : string array) =
  let text     = LetterReader.read() in
  let first    = int_of_string argv.(1) in
  let last     = int_of_string argv.(2) in
  for n = first to last do
    if (n > first) then print_endline "";
    printf "len = %d\n" n;
    (new _AnagramFinder n)#find_all text
  done;
  printf "string length = %5d, comparisons = %5d\n" (String.length text) !Word.comparisons
;;

let argc = Array.length Sys.argv in
  main argc Sys.argv;;


Full source code

(*----------------------------------------------------------------------------- !; 100-globals.ml ml *)

let  printf      = Printf.printf
let _TEXT_SIZE   = 500*512   (*--- large enough for 500 * poem              *)
let _ACC_MINIMUM =  3        (*--- minimum # of words in an anagram set     *)
let _ACC_SIZE    = 20        (*--- fixed size of buffer to hold anagram set *)

(* ----------------------------------------------------------------------------- !; 110-read-text.ml *)
(* --------------------------------------------------------------------------------- LetterReader *)
module LetterReader =
  struct
    type _CharOrEof = CHAR of char | EOF

    let get_char () : _CharOrEof =
      try
        CHAR(input_char stdin)
      with End_of_file ->
        EOF

    let is_letter (ch : char) : bool =
      ((ch >= 'a') && (ch <= 'z')) ||
      ((ch >= 'A') && (ch <= 'Z'))

    let rec get_letter () : _CharOrEof =
      match get_char () with
        CHAR(ch) ->
          if (is_letter ch) then
            CHAR(ch)
          else
            get_letter ()
      | EOF -> EOF

    let rec read_letters (buf : bytes) (put : int) : int =
      match (get_letter ()) with
        CHAR(ch) -> begin
          Bytes.set    buf  put ch;
          read_letters buf (put+1)
        end
      | EOF -> put                         (*--- return number of letters read *)

    let read () : string =
      let buf = Bytes.make (_TEXT_SIZE) ' ' in
      let l   = read_letters buf 0  in
      String.uppercase_ascii (Bytes.sub_string buf 0 l)
  end;;

(* ----------------------------------------------------------------------------- !; 120-type-word.ml *)
(* ----------------------------------------------------------------------------------------- Word *)
module Word =
  struct
    type word = { value : string; unified : string; offset : int }

    let comparisons = ref 0  (*--- Printed at end of run *)

    let get_unified (value : string) : string =
      let len   = String.length  value in
      let chars = Array.make len ' '   in
      let rval  = Bytes.make len ' '   in
      for i = 0 to (len - 1) do
        chars.(i) <- value.[i]         (*--- string     -> char array *)
      done;
      Array.sort Stdlib.compare chars; (*--- sort char array          *)
      for i = 0 to (len - 1) do
        Bytes.set rval i chars.(i)     (*--- char array -> bytes      *)
      done;
      Bytes.to_string rval             (*--- bytes      -> string     *)

    let new_word (value : string) (offset : int) : word =
      { value; unified = (get_unified value); offset }

    let compare_words (a : word) (b : word) =
      comparisons := !comparisons + 1;
      let cw = compare a.unified b.unified in
      if (0 <> cw) then
        cw
      else
        compare a.offset b.offset
  end;;

(* ------------------------------------------------------------------- !; 130-type-anagram-finder.ml *)
(* -------------------------------------------------------------------------------- AnagramFinder *)

class _AnagramFinder (llen : int) =
  object (self)
    val mutable accepted = Array.make _ACC_SIZE (Word.new_word "a" 0)
    val mutable put        : int = 0
    val mutable len        : int = llen

    method print_one_result () : unit =
      printf "%3d: " put;
      for n = 0 to (put-1) do
        let w = accepted.(n) in
        printf " %4d: %s" w.offset w.value
      done;
      print_endline ""

    method no_overlap (cand : Word.word) =
      (cand.offset >= accepted.(put-1).offset+len)

    method exists (cand : Word.word) (offset : int) =
      (offset < put)
        && ((0 == (compare accepted.(offset).value cand.value))
          || self#exists cand (offset + 1))

    method add_candidate (cand : Word.word) : unit =
      if (put == 0) || ((self#no_overlap cand) && (not (self#exists cand 0))) then begin
        accepted.(put) <- cand;
        put            <- put + 1
      end

    method find_all (text : string) : unit =
      put <- 0;
      let nwords = String.length text - len + 1 in
      let words  = Array.make (nwords+1) (Word.new_word "xxx" 0) in
      for offset = 0 to (nwords - 1) do
        words.(offset) <- (Word.new_word (String.sub text offset len) offset)
      done;
      Array.sort Word.compare_words words;
      for n = 0 to (Array.length words) - 2 do
        self#add_candidate words.(n);
        if (0 <> (compare words.(n).unified words.(n+1).unified)) then begin
          if (put >= _ACC_MINIMUM) then self#print_one_result();
          put <- 0;
        end
      done
  end;;

(* ----------------------------------------------------------------------------------------- !; 140-main.ml *)

let main (argc : int) (argv : string array) =
  let text     = LetterReader.read() in
  let first    = int_of_string argv.(1) in
  let last     = int_of_string argv.(2) in
  for n = first to last do
    if (n > first) then print_endline "";
    printf "len = %d\n" n;
    (new _AnagramFinder n)#find_all text
  done;
  printf "string length = %5d, comparisons = %5d\n" (String.length text) !Word.comparisons
;;

let argc = Array.length Sys.argv in
  main argc Sys.argv;;

(*--- __END__ ----------------------------------------------------------------------------------- *)

(*--- Note! "ocaml unix.cma anagram.ml" makes Sys.interactive false;          *)
(*---   This use is not considered interactive.                               *)
(*---   There is no simple, obvious way to check for ocaml vs. compiled code. *)
(*---   Perhaps using regexps to check for /ocaml in argv[0]...               *)

You can reach me by email at “lars dash 7 dot sdu dot se” or by telephone +46 705 189090

View source for the content of this page.