/*
 * stringsx.c
 *
 * (C) 2011 jnw@cpan.org,
 * Distribute under MIT or any GPL license.
 *
 * A simplified strings tool, similar to the tool that
 * comes with gnu binutils, but with the following differences
 *
 * - no -e switch. We support all encodings simultaneously
 * - '\0' characters are stripped, and have no effect, unless
 *   multiple '\0' characters occur in a row.
 * - adjustable fuzziness: up to 2 chars in a row with their 8th bit
 *   set are tolerated (the 3rd one cuts the string); control chars
 *   except '\t', '\n', '\r' always cut a string.
 * - Strings need not be '\0' terminated.
 * - no support for file sections. We always scan the entire file.
 *
 * Implemented in both perl and C. Compile the C version, if you
 * find significant speed issues with the perl version.
 *
 * 2011-11-01, jnw@cpan.org
 * 2012-08-23, jw, no more string termination with \f
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum {
    MIN_LEN    = 10,    /* good chars needed before a string is printed */
    BAD_CUT    = 3,     /* accumulated badness that ends a string */
    QUEUE_SIZE = 20     /* must be >= MIN_LEN and >= BAD_CUT */
};

/*
 * Rate one input byte.
 *
 * Returns 0 for a good byte (this includes an isolated '\0'), 1 for a
 * byte with its 8th bit set (latin1 or utf8), and BAD_CUT + 1 for bytes
 * that must cut a string at once: control chars other than '\t', '\n',
 * '\r', and every '\0' that directly follows another '\0'.
 *
 * *nulrun tracks the current run of '\0' bytes. It saturates at 2 so
 * that arbitrarily long runs of zeros cannot overflow the counter.
 */
static int byte_badness(int ch, int *nulrun)
{
    if (ch == 0) {
        if (*nulrun < 2)
            (*nulrun)++;
        /* a nul every second char is just fine. */
        return (*nulrun > 1) ? BAD_CUT + 1 : 0;
    }

    *nulrun = 0;
    if (ch > 127)
        return 1;
    if (ch == 127)      /* DEL is a control char too */
        return BAD_CUT + 1;
    if (ch < 32 && ch != '\t' && ch != '\n' && ch != '\r')
        return BAD_CUT + 1;
    return 0;
}

/* Write the first len queued bytes to stdout. */
static void flush_queue(const unsigned char *queue, int len)
{
    if (len > 0)
        fwrite(queue, 1, (size_t)len, stdout);
}

/*
 * Usage: stringsx file     ("-" reads standard input)
 *
 * Scans the whole input and prints every run of at least MIN_LEN good
 * characters, one string per line.
 *
 * The scanner has two states:
 *  - candidate (printing == 0): good chars are collected in queuebuf
 *    until MIN_LEN of them are seen, then the queue is printed and we
 *    switch to printing. Badness reaching BAD_CUT throws the candidate
 *    away. Tolerated 8th-bit bytes are dropped in this state.
 *  - printing (printing == 1): good chars go straight to stdout.
 *    Tolerated 8th-bit bytes are parked in queuebuf and only printed
 *    when a good char follows; badness reaching BAD_CUT ends the
 *    string with a newline.
 *
 * Returns 0 on success, 1 on usage, open, read or write errors.
 */
int main(int ac, char **av)
{
    if (ac < 2 || !av[1]) {
        fprintf(stderr, "Usage: %s file\n", (ac > 0 && av[0]) ? av[0] : "stringsx");
        exit(1);
    }

    /*
     * "rb": the input is arbitrary binary data; text mode would mangle
     * it on platforms that distinguish the two. Standard input is used
     * in whatever mode the C library opened it.
     */
    FILE *fp = strcmp(av[1], "-") ? fopen(av[1], "rb") : stdin;
    if (!fp) {
        fprintf(stderr, "%s: %s\n", av[1], strerror(errno));
        exit(1);
    }

    int ch;
    int badcount = 0;   /* badness accumulated since the last good char */
    int printing = 0;
    int queuelen = 0;
    int nulrun = 0;
    unsigned char queuebuf[QUEUE_SIZE];

    while ((ch = getc(fp)) != EOF) {
        int badness = byte_badness(ch, &nulrun);

        if (ch == 0 && !badness)
            continue;   /* isolated '\0': stripped, no effect */

        if (badness) {
            badcount += badness;
            if (badcount >= BAD_CUT) {
                /*
                 * Cut. While printing this terminates the current
                 * string; otherwise it discards the candidate so that
                 * fragments separated by garbage are never merged.
                 * Parked 8th-bit bytes are dropped either way.
                 */
                if (printing)
                    putchar('\n');      /* next string. */
                printing = 0;
                queuelen = 0;
                badcount = 0;
            } else if (printing) {
                /* at most BAD_CUT - 1 bytes are ever parked here */
                queuebuf[queuelen++] = (unsigned char)ch;
            }
            continue;
        }

        /* good char */
        badcount = 0;
        if (printing) {
            flush_queue(queuebuf, queuelen);
            queuelen = 0;
            putchar(ch);
        } else {
            queuebuf[queuelen++] = (unsigned char)ch;
            if (queuelen >= MIN_LEN) {
                flush_queue(queuebuf, queuelen);
                queuelen = 0;
                printing = 1;
            }
        }
    }

    int rc = 0;

    /* getc() returns EOF for read errors too; tell them apart. */
    if (ferror(fp)) {
        fprintf(stderr, "%s: %s\n", av[1], strerror(errno));
        rc = 1;
    }

    /* terminate a string that was still being printed at end of input */
    if (printing)
        putchar('\n');

    if (fflush(stdout) == EOF || ferror(stdout)) {
        fprintf(stderr, "%s: write error\n", av[0]);
        rc = 1;
    }

    fclose(fp);
    return rc;
}