Code Listings

Chapter 24: Regular Expressions

The Regular Expression samples are also preloaded into the LINQPad utility.

Go to cookbook

Regular expression basics:
Console.WriteLine (Regex.Match ("color",   @"colou?r").Success);  // True
Console.WriteLine (Regex.Match ("colour",  @"colou?r").Success);  // True
Console.WriteLine (Regex.Match ("colouur", @"colou?r").Success);  // False

Match m = Regex.Match ("any colour you like", @"colou?r");

Console.WriteLine (m.Success);     // True
Console.WriteLine (m.Index);       // 4
Console.WriteLine (m.Length);      // 6
Console.WriteLine (m.Value);       // colour
Console.WriteLine (m.ToString());  // colour
Match m1 = Regex.Match ("One color? There are two colours in my head!",
                        @"colou?rs?");
Match m2 = m1.NextMatch();
Console.WriteLine (m1);         // color
Console.WriteLine (m2);         // colours
foreach (Match m in Regex.Matches
    ("One color? There are two colours in my head!", @"colou?rs?"))
  Console.WriteLine (m);
Console.WriteLine (Regex.IsMatch ("Jenny", "Jen(ny|nifer)?"));  // True

Compiled regular expressions:

Regex r = new Regex (@"sausages?");
Console.WriteLine (r.Match ("sausage"));   // sausage
Console.WriteLine (r.Match ("sausages"));  // sausages

Regex options:

Console.WriteLine (Regex.Match ("a", "A", RegexOptions.IgnoreCase)); // a
Console.WriteLine (Regex.Match ("a", @"(?i)A"));                     // a
Console.WriteLine (Regex.Match ("AAAa", @"(?i)a(?-i)a"));            // Aa

Character escapes:

Console.WriteLine (Regex.Match ("what?", @"what\?")); // what? (correct)
Console.WriteLine (Regex.Match ("what?", @"what?"));  // what  (incorrect)
Console.WriteLine (Regex.Escape   (@"?"));     // \?
Console.WriteLine (Regex.Unescape (@"\?"));    // ?
Console.WriteLine (Regex.Match ("\\", "\\\\"));    // \
Console.WriteLine (Regex.IsMatch ("hello world", @"hello world"));  // True

Character sets:

Console.Write (Regex.Matches ("That is that.", "[Tt]hat").Count);    // 2
Console.Write (Regex.Match   ("quiz qwerty", "q[^aeiou]").Index);    // 5
Console.Write (Regex.Match   ("b1-c4", @"[a-h]\d-[a-h]\d").Success); // True
Console.Write (Regex.IsMatch ("Yes, please", @"\p{P}"));             // True

Quantifiers:

Console.Write (Regex.Match ("cv15.doc", @"cv\d*\.doc").Success);      // True
Console.Write (Regex.Match ("cvjoint.doc", @"cv.*\.doc").Success);    // True
Console.Write (Regex.Matches ("slow! yeah slooow!", "slo+w").Count);  // 2
Regex bp = new Regex (@"\d{2,3}/\d{2,3}");
Console.WriteLine (bp.Match ("It used to be 160/110"));  // 160/110
Console.WriteLine (bp.Match ("Now it's only 115/75"));   // 115/75

Greedy vs. lazy quantifiers:

string html = "<i>By default</i> quantifiers are <i>greedy</i> creatures";

foreach (Match m in Regex.Matches (html, @"<i>.*</i>"))
  Console.WriteLine (m);
foreach (Match m in Regex.Matches (html, @"<i>.*?</i>"))
  Console.WriteLine (m);

Lookahead and Lookbehind:

Console.WriteLine (Regex.Match ("say 25 miles more", @"\d+\s(?=miles)"));
Console.WriteLine (Regex.Match ("say 25 miles more", @"\d+\s(?=miles).*"));
string password = "...";
bool ok = Regex.IsMatch (password, @"(?=.*\d).{6,}");
string regex = "(?i)good(?!.*(however|but))";
Console.WriteLine (Regex.IsMatch ("Good work! But...",  regex));  // False
Console.WriteLine (Regex.IsMatch ("Good work! Thanks!", regex));  // True
string regex = "(?i)(?<!however.*)good";
Console.WriteLine (Regex.IsMatch ("However good, we...", regex));  // False
Console.WriteLine (Regex.IsMatch ("Very good, thanks!" , regex));  // True

Anchors:

Console.WriteLine (Regex.Match ("Not now", "^[Nn]o"));   // No
Console.WriteLine (Regex.Match ("f = 0.2F", "[Ff]$"));   // F 
string fileNames = "a.txt" + "\r\n" + "b.doc" + "\r\n" + "c.txt";
string r = @".+\.txt(?=\r?$)";

foreach (Match m in Regex.Matches (fileNames, r, RegexOptions.Multiline))
  Console.Write (m + " ");
MatchCollection emptyLines = Regex.Matches (s, "^(?=\r?$)",
                                            RegexOptions.Multiline);

MatchCollection blankLines = Regex.Matches (s, "^[ \t]*(?=\r?$)",
                                            RegexOptions.Multiline);

Word boundaries:

foreach (Match m in Regex.Matches ("Wedding in Sarajevo", @"\b\w+\b"))
  Console.WriteLine (m);

int one = Regex.Matches ("Wedding in Sarajevo", @"\bin\b").Count; // 1
int two = Regex.Matches ("Wedding in Sarajevo", @"in").Count;     // 2

string text = "Don't loose (sic) your cool";
Console.Write (Regex.Match (text, @"\b\w+\b\s(?=\(sic\))"));      // loose

Groups:

Match m = Regex.Match ("206-465-1918", @"(\d{3})-(\d{3}-\d{4})");

Console.WriteLine (m.Groups[1]);   // 206
Console.WriteLine (m.Groups[2]);   // 465-1918

Console.WriteLine (m.Groups[0]);   // 206-465-1918
Console.WriteLine (m);             // 206-465-1918
foreach (Match m in Regex.Matches ("pop pope peep", @"\b(\w)\w+\1\b"))
  Console.Write (m + " ");  // pop peep

Named groups

:
string regEx =
  @"\b"             +  // word boundary
  @"(?'letter'\w)"  +  // match first letter, and name it 'letter'
  @"\w+"            +  // match middle letters
  @"\k'letter'"     +  // match last letter, denoted by 'letter'
  @"\b";               // word boundary

foreach (Match m in Regex.Matches ("bob pope peep", regEx))
  Console.Write (m + " ");  // bob peep
string regFind = 
  @"<(?'tag'\w+?).*>" +  // match first tag, and name it 'tag'
  @"(?'text'.*?)"     +  // match text content, name it 'text'
  @"</\k'tag'>";         // match last tag, denoted by 'tag'

Match m = Regex.Match ("<h1>hello</h1>", regFind);

Console.WriteLine (m.Groups ["tag"]);          // h1
Console.WriteLine (m.Groups ["text"]);         // hello

Replacing and splitting text:

string find = @"\bcat\b";
string replace = "dog";

Console.WriteLine (Regex.Replace ("catapult the cat", find, replace));
string text = "10 plus 20 makes 30";
Console.WriteLine (Regex.Replace (text, @"\d+", @"<$0>"));
string regFind = 
  @"<(?'tag'\w+?).*>" +  // match first tag, and name it 'tag'
  @"(?'text'.*?)"     +  // match text content, name it 'text'
  @"</\k'tag'>";         // match last tag, denoted by 'tag'

string regReplace =
  @"<${tag}"         +  // <tag
  @" value="""       +  // value="
  @"${text}"         +  // text
  @"""/>";              // "/>

Console.Write (Regex.Replace ("<msg>hello</msg>", regFind, regReplace));

MatchEvaluator delegate:

Console.WriteLine (Regex.Replace ("5 is less than 10", @"\d+",
                   m => (int.Parse (m.Value) * 10).ToString()) );

Splitting text:

foreach (string s in Regex.Split ("a5b7c", @"\d"))
  Console.Write (s + " ");     // a b c

foreach (string s in Regex.Split ("oneTwoThree", @"(?=[A-Z])"))
  Console.Write (s + " ");    // one Two Three

Regular Expressions Cookbook

Matching U.S. Social Security number/phone number:

string ssNum = @"\d{3}-\d{2}-\d{4}";

Console.WriteLine (Regex.IsMatch ("123-45-6789", ssNum));      // True

string phone = @"(?x)
  ( \d{3}[-\s] | \(\d{3}\)\s? )
    \d{3}[-\s]?
    \d{4}";

Console.WriteLine (Regex.IsMatch ("123-456-7890",   phone));   // True
Console.WriteLine (Regex.IsMatch ("(123) 456-7890", phone));   // True

Extracting “name = value” pairs (one per line):

string r = @"(?m)^\s*(?'name'\w+)\s*=\s*(?'value'.*)\s*(?=\r?$)";

string text =
  @"id = 3
    secure = true
    timeout = 30";

foreach (Match m in Regex.Matches (text, r))
  Console.WriteLine (m.Groups["name"] + " is " + m.Groups["value"]);

id is 3
secure is true
timeout is 30

Strong password validation:

string r = @"(?x)^(?=.* ( \d | \p{P} | \p{S} )).{6,}";

Console.WriteLine (Regex.IsMatch ("abc12", r));     // False
Console.WriteLine (Regex.IsMatch ("abcdef", r));    // False
Console.WriteLine (Regex.IsMatch ("ab88yz", r));    // True

Lines of at least 80 characters:

string r = @"(?m)^.{80,}(?=\r?$)";

string fifty = new string ('x', 50);
string eighty = new string ('x', 80);

string text = eighty + "\r\n" + fifty + "\r\n" + eighty;

Console.WriteLine (Regex.Matches (text, r).Count);   // 2

Parsing dates/times (N/N/N H:M:S AM/PM):

string r = @"(?x)(?i)
 (\d{1,4}) [./-]
 (\d{1,2}) [./-]
 (\d{1,4}) [\sT]  (\d+):(\d+):(\d+) \s? (A\.?M\.?|P\.?M\.?)?";

string text = "01/02/2008 5:20:50 PM";

foreach (Group g in Regex.Match (text, r).Groups)
  Console.WriteLine (g.Value + " ");

01/02/2008 5:20:50 PM
01
02
2008
5
20
50
PM

Matching Roman numerals:

string r =
  @"(?i)\bm*"         +
  @"(d?c{0,3}|c[dm])" +
  @"(l?x{0,3}|x[lc])" +
  @"(v?i{0,3}|i[vx])" +
  @"\b";

Console.WriteLine (Regex.IsMatch ("MCMLXXXIV", r));   // True

Removing repeated words:

string r = @"(?'dupe'\w+)\W\k'dupe'";

string text = "In the the beginning...";
Console.WriteLine (Regex.Replace (text, r, "${dupe}"));

In the beginning

Word count:

string r = @"\b(\w|[-'])+\b";

string text = "It's all mumbo-jumbo to me";
Console.WriteLine (Regex.Matches (text, r).Count);   // 5

Matching a Guid:

string r =
  @"(?i)\b"           +
  @"[0-9a-fA-F]{8}\-" +
  @"[0-9a-fA-F]{4}\-" +
  @"[0-9a-fA-F]{4}\-" +
  @"[0-9a-fA-F]{4}\-" +
  @"[0-9a-fA-F]{12}"  +
  @"\b";

string text = "Its key is {3F2504E0-4F89-11D3-9A0C-0305E82C3301}.";
Console.WriteLine (Regex.Match (text, r).Index);                    // 12

Parsing an XML tag:

string r = 
  @"<(?'tag'\w+?).*>"  +  // match first tag, and name it 'tag'
  @"(?'text'.*?)"      +  // match text content, name it 'text'
  @"</\k'tag'>";          // match last tag, denoted by 'tag'

string text = "<h1>hello</h1>";

Match m = Regex.Match (text, r);

Console.WriteLine (m.Groups ["tag"]);       // h1
Console.WriteLine (m.Groups ["text"]);      // hello

Splitting a camel-cased word:

string r = @"(?=[A-Z])";

foreach (string s in Regex.Split ("oneTwoThree", r))
  Console.Write (s + " ");    // one Two Three

Obtaining a legal filename:

string input = "My \"good\" <recipes>.txt";

char[] invalidChars = System.IO.Path.GetInvalidPathChars();
string invalidString = Regex.Escape (new string (invalidChars));

string valid = Regex.Replace (input, "[" + invalidString + "]", "");
Console.WriteLine (valid);

My good recipes.txt

Escaping Unicode characters for HTML:

string htmlFragment = "© 2007";

string result = Regex.Replace (htmlFragment, @"[\u0080-\uFFFF]",
                m => @"&#" + ((int)m.Value[0]).ToString() + ";");

Console.WriteLine (result);        // &#169; 2007

 

© 2007, O'Reilly Media, Inc. All rights reserved

C# 3.0 in a Nutshell
Buy from amazon.com Available now