Anagram finder in C++

This is the C++ implementation of the anagram program. The problem and the solution is described in the main anagram article. Do read that article first.

This implementation is the starting point for the other implementations. Once you understand this implementation, then you should have no problem with the other.

class Word

Member data in Word.

  • int offset The offset into the letter-only string. Used for eliminating overlapping candidates.
  • string value The word at offset (std::string).
  • string unified The unified version of value (std::string).

Member functions in Word.

  • Word(int offset, string value, string unified) The constructor. There is not destructor, since no memory is explicitly allocated.

class AnagramFinder

Member data in AnagramFinder.

  • int len The length of the anagrams that this AnagramFinder looks for.
  • vector<Word *> words All the possible words. If there are 512 letters and len is 4, then there are 509 words.
  • vector<Word *> accepted All successful candidates for a single set of anagram words.

Member functions in AnagramFinder.

  • AnagramFinder(char *text, int len) Constructs a new finder for the given length. Initializes words, but does not sort it.
  • static bool compare_words(Word const *a, Word const *b) Passed to std::sort()
  • static std::string get_unified(std::string const &word) Returns the letters in word sorted in increasing order.
  • void print_one_result() Prints the words in accepted.
  • void add_candidate(Word *cand) Tries to add a candidate to accepted. Rejects duplicates of words already in the list. Rejects words that overlap words already in the list.
  • void add_range(int begin, int end) Passes each of the words in the range to add_candidate.
  • void find_all() Finds all the anagram set of with words of length len.

Global data and functions.

  • int read_text(char *text, int allocated) Reads all of the input data, discards all characters except letters, and converts all lower-case letter to upper-case letters.
  • int main(int argc, char *argv[]) Parses command line arguments. There may be s single integer for a single run of the given length, or a range given by two numbers. The default is a run for length 6. A new AnagramFinder is created, run and deleted for each length.

The source code

# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <ctype.h>
# include <assert.h>

# include <algorithm>
# include <vector>
# include <string>

# define VECTSIZE(x) ((sizeof (x)) / (sizeof (x)[0]))
# define MINIMUM 3

static struct {
    int comparisons;
} debug;

//--------------------------------------------------------------------------------------- Word

struct Word {
    Word(int offset, std::string const &value, std::string const &unified)
      : offset (offset),
        value  (value ),
        unified(unified) { }

    int         const offset;
    std::string const value;
    std::string const unified;
    void print() { printf("%4d %s %s\n", offset, value.c_str(), unified.c_str()); }

};

//--------------------------------------------------------------------------------------- AnagramFinder

class AnagramFinder {
  private:
    int                 len;
    std::vector<Word *> words;   //--- Owner of each Word
    std::vector<Word *> accepted; //--- Not owner
    int                 low_offset = 0;

  public:
    AnagramFinder(char *text, int len) : len(len) {
        int last = strlen(text) - len;
        words.reserve(last+1);
        for (int n = 0; n <= last; n++) {
            std::string w(text+n, len);
            words.push_back(new Word(n, w, get_unified(w)));
        }
    }

   ~AnagramFinder() {
        for (Word *it : words  ) delete it;
    }

    static bool compare_words(Word const *a, Word const *b) {
        debug.comparisons++;
        int cmp = a->unified.compare(b->unified);
        return cmp ? (cmp < 0) : (a->offset < b->offset);
    }

    static std::string get_unified(std::string const &s) {
        char tmp[40];
        assert(s.length() < VECTSIZE(tmp));
        std::copy(s.begin(), s.end(), tmp);
        tmp[s.length()] = 0;
        std::sort(tmp, tmp+strlen(tmp));
        return tmp;
    }

    void print_one_result() {
        printf("%3lu:", 0lu + accepted.size());
        for (auto it : accepted)
            printf(" %4d:%s", it->offset, it->value.c_str());
        printf("\n");
    }

    void add_candidate(Word *cand) {
        if (cand->offset < low_offset)
            return;
        for (auto it : accepted)
            if (it->value == cand->value)
                return;
        accepted.push_back(cand);
        low_offset = cand->offset + len;
    }

    void add_range(int begin, int end) {
        if (end-begin >= MINIMUM) {
            accepted.clear();
            low_offset = 0;
            for (int n = begin; n < end; n++)
                add_candidate(words[n]);
            if (accepted.size() >= MINIMUM)
                print_one_result();
        }
    }

    void find_all() {
        std::sort(words.begin(), words.end(), compare_words);

        int begin = 0;
        for (unsigned n = 1; n < words.size(); n++) {
            if (words[n]->unified != words[begin]->unified) {
                add_range(begin, n);
                begin = n;
            }
        }
        add_range(begin, words.size());
    }
};

//--------------------------------------------------------------------------------------- main program

static int read_text(char *text, int allocated)
{
    int ch;
    int end = 0;
    while (EOF != (ch = getchar())) {
        if (isalpha(ch)) {
            assert(end < allocated-1);
            text[end++] = toupper(ch);
        }
    }
    text[end] = 0;
    return end;
}

int main(int argc, char *argv[])
{
    char text[500 * 512 + 1];
    int from     = (argc > 1) ? atoi(argv[1]) : 6;
    int to       = (argc > 2) ? atoi(argv[2]) : from;

    int text_len = read_text(text, VECTSIZE(text));

    for (int len = from; len <= to; len++) {
        AnagramFinder finder(text, len);
        if (len > from)
            printf("\n");
        printf("len = %d\n", len);
        finder.find_all();
    }

    printf("string length = %5d, comparisons = %d\n", text_len, debug.comparisons);
    return 0;
}

You can reach me by email at “lars dash 7 dot sdu dot se” or by telephone +46 705 189090

View source for the content of this page.