Regular Expressions
Overview
C++11 introduced the <regex> library for pattern matching and text processing. This chapter covers regex syntax, matching, searching, replacing, and performance considerations.
┌────────────────────────────────────────────────────────┐
│ REGEX LIBRARY COMPONENTS │
├────────────────────────────────────────────────────────┤
│ │
│ MATCHING │ SEARCHING │ REPLACING │
│ ──────── │ ───────── │ ───────── │
│ • regex_match │ • regex_search │ • regex_replace │
│ • Full match │ • Find first │ • Substitute │
│ • Strict │ • Sub-patterns │ • Format │
│ │
│ ITERATORS │ SYNTAX │ FLAGS │
│ ───────── │ ────── │ ───── │
│ • sregex_iter │ • ECMAScript │ • icase │
│ • token_iter │ • basic │ • multiline │
│ • Find all │ • extended │ • optimize │
│ │ • grep, awk │ │
│ │
└────────────────────────────────────────────────────────┘
Regex Basics
Creating Regex Objects
#include <regex>
#include <iostream>
// Demonstrates the ways a std::regex can be constructed: default grammar,
// option flags, alternative grammars, and raw string literals.
int main() {
    // Basic regex (ECMAScript grammar unless another one is requested)
    std::regex pattern("hello");

    // With flags
    std::regex case_insensitive("HELLO", std::regex::icase);

    // Different grammars, each spelling "one or more digits"
    std::regex ecma_pattern("\\d+", std::regex::ECMAScript);  // Default
    // POSIX basic has no \d shorthand and treats '+' as an ordinary character,
    // so the same idea is written with a bracket expression and '*'.
    std::regex basic_pattern("[0-9][0-9]*", std::regex::basic);
    std::regex extended_pattern("[0-9]+", std::regex::extended);

    // Optimize for repeated use
    std::regex optimized("pattern", std::regex::optimize);

    // Raw string literals (avoid escaping backslashes)
    std::regex email(R"(\w+@\w+\.\w+)");  // No need for \\w

    return 0;
}
Common Regex Patterns
┌────────────────────────────────────────────────────────┐
│ Common Regex Patterns │
├────────────────────────────────────────────────────────┤
│ Pattern │ Matches │
├───────────────┼────────────────────────────────────────┤
│ . │ Any character except newline │
│ \d │ Digit [0-9] │
│ \w │ Word character [A-Za-z0-9_] │
│ \s │ Whitespace [ \t\n\r\f\v] │
│ \D \W \S │ Negated versions │
│ ^ │ Start of string/line │
│ $ │ End of string/line │
│ [abc] │ Character class (a, b, or c) │
│ [^abc] │ Negated class (not a, b, or c) │
│ (abc) │ Capture group │
│ (?:abc) │ Non-capturing group │
│ a|b │ Alternation (a or b) │
│ * │ 0 or more (greedy) │
│ + │ 1 or more (greedy) │
│ ? │ 0 or 1 (greedy) │
│ {n} │ Exactly n │
│ {n,} │ n or more │
│ {n,m} │ Between n and m │
│ *? +? ?? │ Non-greedy versions │
└────────────────────────────────────────────────────────┘
regex_match - Full String Matching
Basic Matching
#include <regex>
#include <string>
#include <iostream>
// Shows that std::regex_match succeeds only when the pattern spans the
// entire input, and how the icase flag relaxes letter case.
int main() {
    const std::string sample = "hello123";
    const std::regex hello_then_digits("hello\\d+");

    // The whole string is "hello" followed by digits -> match
    const bool whole_string_matches = std::regex_match(sample, hello_then_digits);
    if (whole_string_matches) {
        std::cout << "Matched!\n";
    }

    // icase ignores letter case
    const std::regex any_case_hello("hello", std::regex::icase);
    if (std::regex_match("HELLO", any_case_hello)) {
        std::cout << "Case insensitive match!\n";
    }

    // The trailing "world" is not covered by the pattern -> no match
    const bool rejected = !std::regex_match("hello123world", hello_then_digits);
    if (rejected) {
        std::cout << "Doesn't match (extra 'world')\n";
    }

    return 0;
}
Capturing Groups
#include <regex>
#include <string>
#include <iostream>
// Splits an ISO date into year/month/day using three capture groups and
// prints the full match followed by each group.
int main() {
    const std::string iso_date = "2024-12-04";
    const std::regex year_month_day(R"((\d{4})-(\d{2})-(\d{2}))");

    std::smatch parts;
    if (!std::regex_match(iso_date, parts, year_month_day)) {
        return 0;
    }

    // Index 0 is the whole match; 1..3 are the groups in pattern order.
    const char* const labels[] = {"Full match: ", "Year: ", "Month: ", "Day: "};
    for (std::size_t group = 0; group < 4; ++group) {
        std::cout << labels[group] << parts[group] << '\n';
    }

    return 0;
}
Named Captures (via numbered)
#include <iostream>
#include <regex>
#include <string>

// std::regex has no named capture groups, but naming the group indices
// gives call sites the same readability.
int main() {
    const std::string email = "user@example.com";
    const std::regex pattern(R"((\w+)@(\w+)\.(\w+))");
    std::smatch matches;

    if (std::regex_match(email, matches, pattern)) {
        // Document which index is which (groups are numbered from 1,
        // left to right by opening parenthesis).
        const int USER = 1;
        const int DOMAIN = 2;
        const int TLD = 3;

        std::cout << "User: " << matches[USER] << '\n';
        std::cout << "Domain: " << matches[DOMAIN] << '\n';
        std::cout << "TLD: " << matches[TLD] << '\n';
    }

    return 0;
}
regex_search - Finding Patterns
Basic Search
#include <regex>
#include <string>
#include <iostream>
// Finds the first email-like substring and reports where it sits in the text.
int main() {
    const std::string haystack = "Contact us at support@example.com or sales@example.com";
    const std::regex email_like(R"(\w+@\w+\.\w+)");

    std::smatch hit;
    const bool found = std::regex_search(haystack, hit, email_like);  // first occurrence only
    if (!found) {
        return 0;
    }

    std::cout << "Found email: " << hit[0] << '\n'
              << "Position: " << hit.position() << '\n'
              << "Length: " << hit.length() << '\n';

    // prefix()/suffix() are the parts of the input before and after the match
    std::cout << "Prefix: " << hit.prefix() << '\n'
              << "Suffix: " << hit.suffix() << '\n';

    return 0;
}
Finding All Matches
#include <regex>
#include <string>
#include <iostream>
// Prints every price in the text twice: once with a manual regex_search
// loop and once with std::sregex_iterator.
int main() {
    const std::string text = "Prices: $10, $20.50, $3.99";
    const std::regex price(R"(\$\d+(?:\.\d{2})?)");

    // Method 1: repeated regex_search, resuming right after each match
    std::smatch found;
    for (auto cursor = text.cbegin();
         std::regex_search(cursor, text.cend(), found, price);
         cursor = found.suffix().first) {
        std::cout << "Found: " << found[0] << '\n';
    }

    // Method 2: regex_iterator walks the matches for us (better)
    const std::sregex_iterator no_more_matches;
    for (std::sregex_iterator current(text.begin(), text.end(), price);
         current != no_more_matches; ++current) {
        std::cout << "Found: " << current->str() << '\n';
    }

    return 0;
}
regex_replace - Substitution
Basic Replacement
#include <regex>
#include <string>
#include <iostream>
// Basic use of std::regex_replace with a plain replacement string.
int main() {
    std::string text = "Hello, World!";
    std::regex pattern("World");

    // regex_replace substitutes EVERY match by default; this input simply
    // contains "World" once. (Pass format_first_only to stop after one.)
    std::string result = std::regex_replace(text, pattern, "Universe");
    std::cout << result << '\n';  // "Hello, Universe!"

    // Same call with several matches: each digit is replaced
    std::string numbers = "1, 2, 3, 4, 5";
    std::string replaced = std::regex_replace(numbers, std::regex("\\d"), "X");
    std::cout << replaced << '\n';  // "X, X, X, X, X"

    return 0;
}
Using Capture Groups in Replacement
#include <regex>
#include <string>
#include <iostream>
// Uses $N back-references in the replacement text to reorder captured parts.
int main() {
    // YYYY-MM-DD -> DD/MM/YYYY: groups 1..3 are year, month, day
    const std::regex iso_date(R"((\d{4})-(\d{2})-(\d{2}))");
    const std::string date_text = "2024-12-04";
    std::cout << std::regex_replace(date_text, iso_date, "$3/$2/$1") << '\n';  // "04/12/2024"

    // "Last, First" -> "First Last"
    const std::regex last_comma_first(R"((\w+), (\w+))");
    const std::string name = "Doe, John";
    const std::string first_then_last = std::regex_replace(name, last_comma_first, "$2 $1");
    std::cout << first_then_last << '\n';  // "John Doe"

    return 0;
}
Replacement Flags
#include <regex>
#include <string>
#include <iostream>
// Compares the default regex_replace behavior with two format flags.
int main() {
    namespace rc = std::regex_constants;

    const std::string input = "foo FOO Foo";
    const std::regex any_case_foo("foo", std::regex::icase);

    // No flag: every match is replaced
    const std::string all_replaced = std::regex_replace(input, any_case_foo, "bar");
    std::cout << all_replaced << '\n';
    // "bar bar bar"

    // format_first_only: stop after the first match
    const std::string first_replaced =
        std::regex_replace(input, any_case_foo, "bar", rc::format_first_only);
    std::cout << first_replaced << '\n';
    // "bar FOO Foo"

    // format_no_copy: text between matches is dropped from the output
    const std::string replacements_only =
        std::regex_replace(input, any_case_foo, "bar", rc::format_no_copy);
    std::cout << replacements_only << '\n';
    // "barbarbar" (only replacements)

    return 0;
}
Regex Iterators
sregex_iterator
#include <regex>
#include <string>
#include <iostream>
// Walks every "<level> on line <N>" entry in the text and prints the
// captured level and line number.
int main() {
    std::string text = "Error on line 10, warning on line 25, error on line 42";
    // icase so the capitalised "Error" at the start of the text is found too;
    // without it only the two lowercase entries would match.
    std::regex pattern(R"((error|warning) on line (\d+))", std::regex::icase);

    auto begin = std::sregex_iterator(text.begin(), text.end(), pattern);
    auto end = std::sregex_iterator();  // default-constructed == end of sequence

    for (auto it = begin; it != end; ++it) {
        const std::smatch& match = *it;
        std::cout << "Match: " << match.str() << '\n';
        std::cout << " Type: " << match[1] << '\n';  // group 1: error|warning
        std::cout << " Line: " << match[2] << '\n';  // group 2: line number
    }

    return 0;
}
sregex_token_iterator
#include <regex>
#include <string>
#include <iostream>
#include <vector>
// Three uses of std::sregex_token_iterator, selected by its last argument:
// -1 yields the text between matches, 0 the matches themselves, and N > 0
// the N-th capture group of each match.
int main() {
    const std::sregex_token_iterator no_more_tokens;

    const std::string csv = "one,two,three,four";
    const std::regex comma(",");

    // Split by delimiter (-1: pieces between the commas)
    std::vector<std::string> pieces;
    for (std::sregex_token_iterator piece(csv.begin(), csv.end(), comma, -1);
         piece != no_more_tokens; ++piece) {
        pieces.push_back(piece->str());
    }
    for (const std::string& piece : pieces) {
        std::cout << piece << '\n';
    }

    // Extract only the delimiters (0: the whole match)
    for (std::sregex_token_iterator sep(csv.begin(), csv.end(), comma, 0);
         sep != no_more_tokens; ++sep) {
        std::cout << "Delimiter: " << *sep << '\n';
    }

    // Extract a specific group: only the years (group 1)
    const std::string date_list = "2024-12-04, 2023-11-03";
    const std::regex iso_date(R"((\d{4})-(\d{2})-(\d{2}))");
    for (std::sregex_token_iterator year(date_list.begin(), date_list.end(), iso_date, 1);
         year != no_more_tokens; ++year) {
        std::cout << "Year: " << *year << '\n';
    }

    return 0;
}
Common Use Cases
Email Validation
#include <iostream>
#include <regex>
#include <string>

// Returns true when the whole of `email` has the shape local@domain.tld,
// where the TLD is at least two letters. This is a deliberately simple
// check, not a full RFC-compliant validator.
bool is_valid_email(const std::string& email) {
    // Compiled once on first use and shared by later calls, following the
    // "compile once, use many times" rule.
    static const std::regex pattern(R"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})");
    return std::regex_match(email, pattern);
}
// Prints the validation result for one good and one bad address.
// A bool streams as 1 or 0 by default.
int main() {
    const char* const candidates[] = {
        "user@example.com",  // 1 (true)
        "invalid.email"      // 0 (false)
    };
    for (const char* candidate : candidates) {
        std::cout << is_valid_email(candidate) << '\n';
    }
    return 0;
}
URL Parsing
#include <regex>
#include <string>
#include <iostream>
// The pieces of a parsed URL. A piece that is absent from the input is
// left as an empty string.
struct URL {
    std::string protocol;
    std::string domain;
    std::string port;
    std::string path;
    std::string query;
};

// Splits an http/https URL into its components.
// Returns a URL with every field empty when `url` does not match.
URL parse_url(const std::string& url) {
    // Groups: 1 = protocol, 2 = domain, 3 = port (optional),
    //         4 = path (optional), 5 = query (text after the optional '?')
    const std::regex url_shape(
        R"(^(https?):\/\/([^:/]+)(?::(\d+))?(\/[^?]*)?\??(.*)$)"
    );

    std::smatch parts;
    if (!std::regex_match(url, parts, url_shape)) {
        return URL{};
    }

    URL parsed;
    parsed.protocol = parts[1].str();
    parsed.domain = parts[2].str();
    parsed.port = parts[3].str();   // empty when the group did not participate
    parsed.path = parts[4].str();
    parsed.query = parts[5].str();
    return parsed;
}
int main() {
URL url = parse_url("https://example.com:8080/path/to/page?key=value");
std::cout << "Protocol: " << url.protocol << '\n';
std::cout << "Domain: " << url.domain << '\n';
std::cout << "Port: " << url.port << '\n';
std::cout << "Path: " << url.path << '\n';
std::cout << "Query: " << url.query << '\n';
return 0;
}
Log Parsing
#include <fstream>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

// One parsed log record.
struct LogEntry {
    std::string timestamp;  // text between the square brackets
    std::string level;      // word before the colon, e.g. INFO
    std::string message;    // rest of the line after ": "
};

// Extracts every "[timestamp] LEVEL: message" record found in `log_text`.
// Text that does not fit the pattern is skipped. Returns the records in the
// order they appear.
std::vector<LogEntry> parse_log(const std::string& log_text) {
    std::vector<LogEntry> entries;

    // Pattern: [YYYY-MM-DD HH:MM:SS] LEVEL: message
    // '.' does not match a newline, so (.+) stops at the end of the line.
    const std::regex pattern(R"(\[([\d\-: ]+)\] (\w+): (.+))");

    const std::sregex_iterator end;
    for (std::sregex_iterator it(log_text.begin(), log_text.end(), pattern);
         it != end; ++it) {
        const std::smatch& fields = *it;
        entries.push_back({fields[1].str(), fields[2].str(), fields[3].str()});
    }

    return entries;
}
Phone Number Formatting
#include <regex>
#include <string>
#include <iostream>
// Reformats a phone number as (XXX) XXX-XXXX.
// All non-digit characters are dropped first; the first run of ten digits is
// then rewritten. Input without ten consecutive digits comes back as just
// its digits.
std::string format_phone_number(const std::string& phone) {
    const std::regex non_digit(R"(\D)");
    const std::regex ten_digits(R"((\d{3})(\d{3})(\d{4}))");

    // Step 1: keep only the digits
    const std::string digits_only = std::regex_replace(phone, non_digit, "");

    // Step 2: area code, exchange, subscriber -> "(1) 2-3"
    return std::regex_replace(digits_only, ten_digits, "($1) $2-$3");
}
// Formats the same number written three different ways; each line printed
// is "(123) 456-7890".
int main() {
    const char* const spellings[] = {
        "1234567890",
        "123-456-7890",
        "(123) 456-7890"
    };
    for (const char* spelling : spellings) {
        std::cout << format_phone_number(spelling) << '\n';
        // (123) 456-7890
    }
    return 0;
}
Performance Considerations
Regex Compilation Cost
┌────────────────────────────────────────────────────────┐
│ Regex Performance Tips │
├────────────────────────────────────────────────────────┤
│ │
│ 1. Compile Once, Use Many Times: │
│ BAD: for (...) { std::regex r("pattern"); ... } │
│ GOOD: std::regex r("pattern"); │
│ for (...) { std::regex_match(..., r); } │
│ │
│ 2. Use std::regex::optimize: │
│ std::regex r("pattern", std::regex::optimize); │
│ • Slower compilation, faster execution │
│ │
│ 3. Avoid Catastrophic Backtracking: │
│ BAD: (a+)+b │
│ GOOD: a+b │
│ │
│ 4. Use Non-Capturing Groups: │
│ (?:pattern) instead of (pattern) when not needed │
│ │
│ 5. Anchor Patterns: │
│ ^pattern$ is faster than pattern for full match │
│ │
│ 6. Consider Alternatives: │
│ • string::find() for simple substring search │
│ • Hand-written parser for complex cases │
│ │
└────────────────────────────────────────────────────────┘
Benchmarking Example
#include <regex>
#include <string>
#include <chrono>
#include <iostream>
// Times 10000 full-string matches twice and prints both durations in
// milliseconds: first constructing the regex inside the loop, then with a
// single regex (built with the optimize flag) constructed before timing starts.
void benchmark_regex() {
    namespace chr = std::chrono;

    const std::string subject = "test@example.com";
    const int iterations = 10000;

    // Compile every time (SLOW): construction cost is paid on each iteration
    const auto slow_begin = chr::steady_clock::now();
    for (int round = 0; round < iterations; ++round) {
        std::regex per_iteration(R"(\w+@\w+\.\w+)");
        std::regex_match(subject, per_iteration);
    }
    const auto slow_elapsed =
        chr::duration_cast<chr::milliseconds>(chr::steady_clock::now() - slow_begin);
    std::cout << "Compile each time: " << slow_elapsed.count() << "ms\n";

    // Compile once (FAST): the regex is built before the clock starts
    const std::regex shared(R"(\w+@\w+\.\w+)", std::regex::optimize);
    const auto fast_begin = chr::steady_clock::now();
    for (int round = 0; round < iterations; ++round) {
        std::regex_match(subject, shared);
    }
    const auto fast_elapsed =
        chr::duration_cast<chr::milliseconds>(chr::steady_clock::now() - fast_begin);
    std::cout << "Compile once: " << fast_elapsed.count() << "ms\n";
}
Error Handling
Catching Regex Errors
#include <regex>
#include <iostream>
// Shows that an ill-formed pattern makes the std::regex constructor throw
// std::regex_error, and how to inspect the error code.
int main() {
    // Invalid regex: unmatched parenthesis
    try {
        std::regex unbalanced("(abc");
    } catch (const std::regex_error& err) {
        std::cout << "Regex error: " << err.what() << '\n'
                  << "Error code: " << err.code() << '\n';
    }

    // Check error codes: a reversed range is reported as error_range
    try {
        std::regex reversed_range("[z-a]");  // Invalid range
    } catch (const std::regex_error& err) {
        const bool is_range_error = (err.code() == std::regex_constants::error_range);
        if (is_range_error) {
            std::cout << "Invalid character range\n";
        }
    }

    return 0;
}
Best Practices
1. Use Raw String Literals
// GOOD: Clear, no double backslashes -- the raw literal R"(...)" hands
// \d and \. to the regex engine exactly as written.
std::regex pattern(R"(\d+\.\d+)");
// BAD: Hard to read -- the same pattern, but every backslash has to be
// doubled to survive ordinary string-literal escaping.
// (This is an alternative spelling of the declaration above; use one or the other.)
std::regex pattern("\\d+\\.\\d+");
2. Document Complex Patterns
// GOOD: Documented regex
// Pattern: (year)-(month)-(day)
// Example: 2024-12-04
std::regex date_pattern(R"((\d{4})-(\d{2})-(\d{2}))");
// Capture-group indices, in the order the groups open in the pattern.
// Index 0 is not listed because it always refers to the whole match.
const int YEAR = 1;
const int MONTH = 2;
const int DAY = 3;
3. Validate Input
// Pre-check to run before handing `input` to a regex.
// Currently only rejects inputs longer than 10000 characters, to bound the
// work a backtracking-heavy pattern could do.
bool is_safe_input(const std::string& input) {
    const std::string::size_type max_length = 10000;
    // Add more checks as needed
    return input.size() <= max_length;
}
4. Use Appropriate Matching Function
// NOTE: the variables below (email, email_pattern, text, match, pattern,
// replacement) are placeholders for values declared elsewhere.

// regex_match: Entire string must match
// Use for: Validation
if (std::regex_match(email, email_pattern)) { /* valid email */ }
// regex_search: Find pattern anywhere
// Use for: Extraction (details of the hit are stored in `match`)
if (std::regex_search(text, match, pattern)) { /* found */ }
// regex_replace: Replace occurrences
// Use for: Transformation (returns the rewritten copy)
auto result = std::regex_replace(text, pattern, replacement);
Common Pitfalls
1. Forgetting to Escape Special Characters
// BAD: . matches any character
std::regex pattern("example.com"); // Matches "exampleXcom"
// GOOD: \. matches literal dot
// (Alternative to the declaration above; the raw literal keeps the single backslash.)
std::regex pattern(R"(example\.com)");
2. Catastrophic Backtracking
// BAD: Exponential time complexity
// The nested quantifiers in (a+)+ allow a run of 'a's to be divided between
// the inner and outer repetition in many different ways.
std::regex bad(R"((a+)+b)");
std::string input(30, 'a'); // No 'b' at end
// Takes forever to fail!
// (Matching `input` against `bad` cannot succeed without a 'b', so a
// backtracking matcher may try all of those divisions before giving up.)
// GOOD: Linear time
// Accepts the same strings with a single quantifier.
std::regex good(R"(a+b)");
3. Not Using std::regex::optimize
// If using regex repeatedly
// std::regex::optimize asks for slower construction in exchange for faster
// matching, so it pays off only when the same object is matched many times.
std::regex pattern("\\w+", std::regex::optimize);
Alternatives to Regex
When regex might not be the best choice:
// NOTE: `text` is a placeholder for a std::string declared elsewhere.

// Simple substring search: Use string::find
// (find returns std::string::npos when the substring is absent)
if (text.find("substring") != std::string::npos) { /* ... */ }
// Simple prefix/suffix: Use string methods
if (text.starts_with("prefix")) { /* ... */ } // C++20
if (text.ends_with("suffix")) { /* ... */ } // C++20
// Tokenization: Use string_view or istringstream
// Complex parsing: Consider dedicated parser libraries
Complete Practical Example: Data Validator and Parser
Here’s a comprehensive example integrating regex matching, searching, replacing, and validation:
#include <iostream>
#include <regex>
#include <string>
#include <vector>
#include <map>
#include <fstream>
#include <sstream>
// Data validation class using multiple regex patterns
// Data validation class using multiple regex patterns.
//
// Every pattern is compiled once, when a DataValidator is constructed, and
// reused by all member functions. All operations are const and return their
// results by value.
class DataValidator {
private:
    // Compiled regex patterns (compile once, use many times)
    std::regex email_pattern_{R"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"};
    std::regex phone_pattern_{R"(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})"};
    std::regex url_pattern_{R"(https?://[^\s]+)"};
    std::regex ip_pattern_{R"(\b(?:\d{1,3}\.){3}\d{1,3}\b)"};
    std::regex date_pattern_{R"(\d{4}-\d{2}-\d{2})"};
    std::regex credit_card_pattern_{R"(\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})"};
    // Group 1 captures the last four digits so mask_ssn() can keep them visible.
    std::regex ssn_pattern_{R"(\d{3}-\d{2}-(\d{4}))"};
    // Strips separators from phone and card numbers.
    std::regex non_digit_pattern_{R"(\D)"};

    // Returns the text of every non-overlapping match of `pattern` in `text`,
    // in order of appearance.
    static std::vector<std::string> find_all(const std::string& text,
                                             const std::regex& pattern) {
        std::vector<std::string> found;
        const std::sregex_iterator end;
        for (std::sregex_iterator it(text.begin(), text.end(), pattern); it != end; ++it) {
            found.push_back(it->str());
        }
        return found;
    }

public:
    // Email validation: true when the whole string is shaped like an address.
    bool is_valid_email(const std::string& email) const {
        return std::regex_match(email, email_pattern_);
    }

    // Extract all emails from text
    std::vector<std::string> extract_emails(const std::string& text) const {
        return find_all(text, email_pattern_);
    }

    // Phone number validation and formatting
    struct PhoneNumber {
        std::string original;   // input exactly as given
        std::string formatted;  // "(XXX) XXX-XXXX", or empty when not valid
        bool valid;
    };

    // Validates `phone` as a whole and, when valid, also produces the
    // normalized "(XXX) XXX-XXXX" form.
    PhoneNumber validate_phone(const std::string& phone) const {
        PhoneNumber result{phone, "", false};
        if (!std::regex_match(phone, phone_pattern_)) {
            return result;
        }
        result.valid = true;

        // Extract digits only
        const std::string digits = std::regex_replace(phone, non_digit_pattern_, "");

        // Format as (XXX) XXX-XXXX
        if (digits.length() == 10) {
            result.formatted = "(" + digits.substr(0, 3) + ") " +
                               digits.substr(3, 3) + "-" +
                               digits.substr(6, 4);
        }
        return result;
    }

    // URL extraction: every http/https URL, each running up to the next whitespace.
    std::vector<std::string> extract_urls(const std::string& text) const {
        return find_all(text, url_pattern_);
    }

    // Validate and parse date
    struct Date {
        int year, month, day;
        bool valid;
    };

    // Parses a YYYY-MM-DD string. `valid` is false, with all numbers zero,
    // when the shape is wrong. When the shape is right the numbers are filled
    // in and `valid` reports whether they pass the basic range check below.
    Date parse_date(const std::string& date_str) const {
        Date result{0, 0, 0, false};
        if (!std::regex_match(date_str, date_pattern_)) {
            return result;
        }

        // The pattern guarantees digits at these fixed offsets, so the
        // conversions below cannot fail.
        result.year = std::stoi(date_str.substr(0, 4));
        result.month = std::stoi(date_str.substr(5, 2));
        result.day = std::stoi(date_str.substr(8, 2));

        // Basic validation (does not check the day against the month's length)
        result.valid = result.year >= 1900 && result.year <= 2100 &&
                       result.month >= 1 && result.month <= 12 &&
                       result.day >= 1 && result.day <= 31;
        return result;
    }

    // Mask sensitive data (credit cards, SSN)

    // Replaces every card number in `text` with "**** **** **** NNNN",
    // where NNNN is the card's last four digits. Everything else is copied
    // through unchanged.
    std::string mask_credit_cards(const std::string& text) const {
        // std::regex_replace only accepts a format string, not a callback,
        // so the output is assembled match by match.
        std::string masked;
        masked.reserve(text.size());

        auto copied_up_to = text.cbegin();
        const std::sregex_iterator end;
        for (std::sregex_iterator it(text.cbegin(), text.cend(), credit_card_pattern_);
             it != end; ++it) {
            const std::smatch& match = *it;

            // Text between the previous match and this one
            masked.append(copied_up_to, match[0].first);

            // Remove all non-digits, then mask all but the last 4 digits
            const std::string digits =
                std::regex_replace(match.str(), non_digit_pattern_, "");
            if (digits.length() >= 4) {
                masked += "**** **** **** " + digits.substr(digits.length() - 4);
            } else {
                masked += "****";
            }

            copied_up_to = match[0].second;
        }
        // Text after the last match (or the whole input when nothing matched)
        masked.append(copied_up_to, text.cend());
        return masked;
    }

    // Replaces every NNN-NN-NNNN in `text` with "***-**-NNNN", keeping only
    // the last four digits.
    std::string mask_ssn(const std::string& text) const {
        return std::regex_replace(text, ssn_pattern_, "***-**-$1");
    }

    // Find IP addresses (dotted quads; octet values are not range-checked)
    std::vector<std::string> extract_ip_addresses(const std::string& text) const {
        return find_all(text, ip_pattern_);
    }
};
// Log parser using regex
// Log parser using regex.
// Recognizes lines of the form: [TIMESTAMP] [LEVEL] [SOURCE] Message
class LogParser {
private:
    // Groups: 1 = timestamp, 2 = level, 3 = source, 4 = message
    std::regex log_pattern_{R"(\[([^\]]+)\]\s*\[(\w+)\]\s*\[([^\]]+)\]\s*(.+))"};

public:
    // One parsed log line.
    struct LogEntry {
        std::string timestamp;
        std::string level;
        std::string source;
        std::string message;
    };

    // Parses a single line into `entry`.
    // Returns false, leaving `entry` untouched, when the whole line does not
    // fit the expected shape.
    bool parse_log_line(const std::string& line, LogEntry& entry) const {
        std::smatch fields;
        const bool recognized = std::regex_match(line, fields, log_pattern_);
        if (!recognized) {
            return false;
        }
        entry.timestamp = fields[1].str();
        entry.level = fields[2].str();
        entry.source = fields[3].str();
        entry.message = fields[4].str();
        return true;
    }

    // Reads `filename` line by line and returns the lines that parse.
    // Unrecognized lines are skipped; a file that cannot be read yields an
    // empty result.
    std::vector<LogEntry> parse_log_file(const std::string& filename) const {
        std::vector<LogEntry> parsed;
        std::ifstream input(filename);
        for (std::string current_line; std::getline(input, current_line);) {
            LogEntry candidate;
            if (!parse_log_line(current_line, candidate)) {
                continue;
            }
            parsed.push_back(candidate);
        }
        return parsed;
    }

    // Filter logs by level: keeps the entries whose level equals `level`
    // exactly (case-sensitive), preserving their order.
    std::vector<LogEntry> filter_by_level(
        const std::vector<LogEntry>& entries,
        const std::string& level) const {
        std::vector<LogEntry> kept;
        for (const LogEntry& candidate : entries) {
            if (candidate.level != level) {
                continue;
            }
            kept.push_back(candidate);
        }
        return kept;
    }
};
// Text sanitizer/cleaner
// Text sanitizer/cleaner.
//
// Each pattern is compiled once per TextSanitizer object and reused by the
// member functions, instead of being rebuilt on every call.
class TextSanitizer {
private:
    std::regex html_tag_pattern_{R"(<[^>]*>)"};
    std::regex whitespace_run_pattern_{R"(\s+)"};
    std::regex edge_whitespace_pattern_{R"(^\s+|\s+$)"};
    std::regex hashtag_pattern_{R"(#(\w+))"};
    std::regex mention_pattern_{R"(@(\w+))"};
    std::regex url_pattern_{R"((https?://[^\s]+))"};

    // Returns capture group 1 of every match of `pattern` in `text`,
    // in order of appearance.
    static std::vector<std::string> collect_first_group(const std::string& text,
                                                        const std::regex& pattern) {
        std::vector<std::string> captured;
        const std::sregex_iterator end;
        for (std::sregex_iterator it(text.begin(), text.end(), pattern); it != end; ++it) {
            captured.push_back((*it)[1].str());
        }
        return captured;
    }

public:
    // Remove HTML tags: deletes every "<...>" span, keeping the text between tags.
    std::string remove_html_tags(const std::string& text) const {
        return std::regex_replace(text, html_tag_pattern_, "");
    }

    // Normalize whitespace: collapses each run of whitespace to one space and
    // trims the ends.
    std::string normalize_whitespace(const std::string& text) const {
        // Replace multiple spaces with single space
        const std::string collapsed =
            std::regex_replace(text, whitespace_run_pattern_, " ");
        // Trim leading/trailing spaces
        return std::regex_replace(collapsed, edge_whitespace_pattern_, "");
    }

    // Extract hashtags (returned without the leading '#')
    std::vector<std::string> extract_hashtags(const std::string& text) const {
        return collect_first_group(text, hashtag_pattern_);
    }

    // Extract mentions (returned without the leading '@')
    std::vector<std::string> extract_mentions(const std::string& text) const {
        return collect_first_group(text, mention_pattern_);
    }

    // Convert URLs to links: wraps each http/https URL in an <a> element.
    // NOTE: the matched text is inserted verbatim and is not HTML-escaped;
    // escape it separately before using this on untrusted input.
    std::string linkify_urls(const std::string& text) const {
        return std::regex_replace(text, url_pattern_, "<a href=\"$1\">$1</a>");
    }
};
// CSV parser using regex
// CSV parser using regex.
// Handles plain fields and double-quoted fields that may contain commas.
// Quotes escaped by doubling ("") inside a quoted field are not supported.
class CSVParser {
private:
    // Each match is one field: a separator (start of line or a comma)
    // followed by either a quoted field (group 1, quotes excluded) or an
    // unquoted field (group 2).
    //
    // The custom raw-string delimiter `re` is required here: the pattern
    // itself contains the character sequence )" which would terminate a
    // plain R"( ... )" literal in the middle of the pattern.
    std::regex csv_pattern_{R"re((?:^|,)(?:"([^"]*)"|([^,]*)))re"};

public:
    // Splits one CSV line into its fields, in order. Surrounding quotes are
    // removed from quoted fields.
    std::vector<std::string> parse_line(const std::string& line) const {
        std::vector<std::string> fields;
        const std::sregex_iterator end;
        for (std::sregex_iterator it(line.begin(), line.end(), csv_pattern_);
             it != end; ++it) {
            const std::smatch& match = *it;
            // Get either quoted (group 1) or unquoted (group 2) field
            fields.push_back(match[1].matched ? match[1].str() : match[2].str());
        }
        return fields;
    }
};
// Demo driver: exercises DataValidator, TextSanitizer and CSVParser on
// small fixed inputs and prints the results section by section.
int main() {
    std::cout << "=== Data Validator and Parser Demo ===\n\n";

    // 1. Email validation and extraction
    std::cout << "--- Email Validation ---\n";
    DataValidator validator;
    std::string text1 = "Contact us at info@example.com or support@test.org";
    auto emails = validator.extract_emails(text1);
    std::cout << "Found " << emails.size() << " emails:\n";
    for (const auto& email : emails) {
        std::cout << " " << email
                  << " (" << (validator.is_valid_email(email) ? "valid" : "invalid")
                  << ")\n";
    }

    // 2. Phone number validation and formatting
    std::cout << "\n--- Phone Number Validation ---\n";
    std::vector<std::string> phones = {
        "555-123-4567",
        "(555) 123-4567",
        "5551234567",
        "invalid"
    };
    for (const auto& phone : phones) {
        auto result = validator.validate_phone(phone);
        std::cout << phone << " -> ";
        if (result.valid) {
            std::cout << result.formatted << " (valid)\n";
        } else {
            std::cout << "invalid\n";
        }
    }

    // 3. URL extraction
    std::cout << "\n--- URL Extraction ---\n";
    std::string text2 = "Visit https://example.com or http://test.org for more info";
    auto urls = validator.extract_urls(text2);
    std::cout << "Found " << urls.size() << " URLs:\n";
    for (const auto& url : urls) {
        std::cout << " " << url << "\n";
    }

    // 4. Date parsing
    std::cout << "\n--- Date Parsing ---\n";
    std::vector<std::string> dates = {"2024-12-04", "2024-13-01", "invalid"};
    for (const auto& date_str : dates) {
        auto date = validator.parse_date(date_str);
        std::cout << date_str << " -> ";
        if (date.valid) {
            std::cout << "Year: " << date.year << ", Month: " << date.month
                      << ", Day: " << date.day << "\n";
        } else {
            std::cout << "invalid\n";
        }
    }

    // 5. Masking sensitive data
    std::cout << "\n--- Data Masking ---\n";
    std::string sensitive = "Card: 1234-5678-9012-3456, SSN: 123-45-6789";
    std::cout << "Original: " << sensitive << "\n";
    // The sample holds both a card number and an SSN, so both maskers are
    // applied; masking only the card would leave the SSN in the output.
    std::cout << "Masked: "
              << validator.mask_ssn(validator.mask_credit_cards(sensitive)) << "\n";

    // 6. Text sanitization
    std::cout << "\n--- Text Sanitization ---\n";
    TextSanitizer sanitizer;
    std::string html = "<p>Hello <b>World</b>!</p>";
    std::cout << "HTML: " << html << "\n";
    std::cout << "Clean: " << sanitizer.remove_html_tags(html) << "\n";
    std::string messy = " Too many spaces ";
    std::cout << "Messy: '" << messy << "'\n";
    std::cout << "Clean: '" << sanitizer.normalize_whitespace(messy) << "'\n";

    // 7. Social media parsing
    std::cout << "\n--- Social Media Parsing ---\n";
    std::string tweet = "Check out #cpp #programming! Thanks @john and @jane";
    auto hashtags = sanitizer.extract_hashtags(tweet);
    std::cout << "Hashtags: ";
    for (const auto& tag : hashtags) {
        std::cout << "#" << tag << " ";
    }
    std::cout << "\n";
    auto mentions = sanitizer.extract_mentions(tweet);
    std::cout << "Mentions: ";
    for (const auto& mention : mentions) {
        std::cout << "@" << mention << " ";
    }
    std::cout << "\n";

    // 8. CSV parsing
    std::cout << "\n--- CSV Parsing ---\n";
    CSVParser csv_parser;
    std::string csv_line = R"(John,"Doe, Jr.",30,"New York")";
    auto fields = csv_parser.parse_line(csv_line);
    std::cout << "CSV line: " << csv_line << "\n";
    std::cout << "Parsed fields:\n";
    for (size_t i = 0; i < fields.size(); ++i) {
        std::cout << " Field " << i << ": '" << fields[i] << "'\n";
    }

    std::cout << "\n=== Demo Complete ===\n";
    return 0;
}
Concepts Demonstrated:
- regex_match: Full string validation
- regex_search: Finding patterns in text
- regex_replace: Substitution and transformation
- regex_iterator: Finding all occurrences
- Capture groups: Extracting sub-patterns
- Raw string literals: R"(...)" for patterns
- Compiled regex: Compile once, use many times
- Phone number formatting: Extracting and reformatting
- Data masking: Security and privacy
- HTML/text cleaning: Removing tags, normalizing
- CSV parsing: Handling quoted fields
- Social media parsing: Hashtags and mentions
- Multiple patterns: Different regex for different data
This example shows real-world regex applications for data validation and parsing!
Next Steps
- Next: Modules →
- Previous: ← Memory Management
Part 20 of 22 - Regular Expressions