Removing and Cleaning search content to prevent DoS attacks

There are many ways to search a system; what we have to be careful of is making sure that the search itself cannot be used to cause a DoS (Denial of Service) attack.

So, following on from my article "Denial Of Service (DoS) attacks via SQL Wildcards should be prevented", here is a method that removes unwanted characters from the search text and cleans it up to prevent such an attack.

using System;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Specialized;

/// <summary>
/// Cleans free-text search input before it is used in a query, so that
/// wildcard and pattern characters supplied by the user cannot be used to
/// build an expensive (DoS-style) search.
/// </summary>
public class Search
{
    /// <summary>
    /// Characters removed from every search string: SQL LIKE wildcards
    /// (%, _, [, ], ^) together with other pattern metacharacters.
    /// </summary>
    private static readonly char[] StrippedCharacters =
    {
        '\\', '|', '(', ')', '[', ']', '*', '?', '}', '{', '^', '+', '%', '_'
    };

    /// <summary>Characters that separate one search word from the next.</summary>
    private static readonly char[] WordSeparators = { ' ', '\n', '\r' };

    // Static so the compiled pattern is built once, not once per Search instance.
    private static readonly Regex RegexStripHtml = new Regex("<[^>]*>", RegexOptions.Compiled);

    /// <summary>
    /// Words that are dropped from the cleaned search text. Entries should be
    /// lower case, because every word is lower-cased before it is looked up.
    /// </summary>
    private StringCollection StopWords
    {
        get
        {
            var stopWords = new StringCollection();
            // Add your stopword here, or get them from a data source
            return stopWords;
        }
    }

    /// <summary>
    /// Produces a normalised, space-separated list of search words.
    /// </summary>
    /// <param name="content">Raw search text; <c>null</c> is treated as empty.</param>
    /// <param name="removeHtml">When <c>true</c>, anything that looks like an HTML tag is removed first.</param>
    /// <returns>
    /// The lower-cased words of <paramref name="content"/> joined by single
    /// spaces, with the stripped characters, single-character words and stop
    /// words removed. Never <c>null</c>.
    /// </returns>
    public string CleanContent(string content, bool removeHtml)
    {
        // Previously a null input threw NullReferenceException unless removeHtml was true.
        if (content == null)
        {
            return string.Empty;
        }

        if (removeHtml)
        {
            content = StripHtml(content);
        }

        // Single pass instead of fourteen chained Replace calls; the characters
        // are simply dropped, so "moo%n" becomes "moon" rather than two words.
        var filtered = new StringBuilder(content.Length);
        foreach (var character in content)
        {
            if (Array.IndexOf(StrippedCharacters, character) < 0)
            {
                filtered.Append(character);
            }
        }

        // Read the stop words once per call; the getter builds a new collection each time.
        var stopWords = StopWords;

        var words = filtered.ToString()
            .Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries)
            .Select(rawWord => rawWord.ToLowerInvariant().Trim())
            .Where(word => word.Length > 1 && !stopWords.Contains(word))
            .ToArray();

        return string.Join(" ", words);
    }

    /// <summary>
    /// Removes every <c>&lt;...&gt;</c> sequence from <paramref name="html"/> and trims the result.
    /// </summary>
    /// <param name="html">Text that may contain HTML tags.</param>
    /// <returns>The text without tags, or an empty string for null/whitespace input.</returns>
    private static string StripHtml(string html)
    {
        return StringIsNullOrWhitespace(html) ? string.Empty : RegexStripHtml.Replace(html, string.Empty).Trim();
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="value"/> is null or contains only whitespace.
    /// </summary>
    private static bool StringIsNullOrWhitespace(string value)
    {
        return ((value == null) || (value.Trim().Length == 0));
    }
}

Some of this code was cribbed from BlogEngine.

Here are some unit tests for the CleanContent method. Yes, unit tests: this is the perfect kind of method to unit test, and the tests also help in understanding how the method works.

using Microsoft.VisualStudio.TestTools.UnitTesting;
using Capita.Dolphin.Web.Helpers;

/// <summary>
/// Tests for Search.CleanContent: each case feeds in a differently
/// "polluted" version of the same phrase and expects the same cleaned text.
/// </summary>
[TestClass]
public class HelperSearchTests
{
    /// <summary>The cleaned text every test case is expected to produce.</summary>
    private const string CleanedPhrase = "hello moon";

    /// <summary>
    /// Runs CleanContent on <paramref name="rawInput"/> with a fresh Search
    /// instance and asserts that the result equals the cleaned phrase.
    /// </summary>
    private static void AssertCleansToPhrase(string rawInput, bool removeHtml)
    {
        // Arrange
        var sut = new Search();

        // Act
        var cleaned = sut.CleanContent(rawInput, removeHtml);

        // Assert
        Assert.AreEqual(CleanedPhrase, cleaned);
    }

    /// <summary>Plain text is only lower-cased.</summary>
    [TestMethod]
    public void CleanContentValidCharacters()
    {
        AssertCleansToPhrase("Hello moon", false);
    }

    /// <summary>Wildcard and pattern characters inside a word are removed.</summary>
    [TestMethod]
    public void CleanContentInValidCharacters()
    {
        AssertCleansToPhrase("Hello moo%*([|+^}{)n", false);
    }

    /// <summary>HTML tags are removed when HTML stripping is requested.</summary>
    [TestMethod]
    public void CleanContentRemoveHTMLCharacters()
    {
        AssertCleansToPhrase("<p>Hello moon</p>", true);
    }

    /// <summary>Leading wildcard characters and a trailing HTML tag are both removed.</summary>
    [TestMethod]
    public void CleanContentRemoveExtraWildCardsCharacters()
    {
        AssertCleansToPhrase("[][^]_%Hello moon</p>", true);
    }
}