SITE-A : Programming Lightweight Parser for Tagged Document

#### A lightweight module to load something that looks like XML ####

A document that has tags in it is just cool.
With that feature we can easily deal with various types of data structures. But thanks to the prestigious name 'XML', we have to take care of many, many undesirable but allegedly convenient regulations.
That's too much trouble, isn't it?

DOCTYPE declaration? Who cares?
DTD? Who knows?

Listen. What I wanna do is just to load a document that has tags and use it in my promising program! Get out of my way!

Surely, you think so, too, don't you?

The source code to load a document that has tags

XML? I got tired of that name.
I'm just going to load a document that has tags.

- Header File [NotXML.h] -

				//=========================================================
				//
				// 簡易XMLパーサークラス（宣言）
				//
				//=========================================================
				
				#ifndef _NOTXML_
				#define _NOTXML_
				
				#include <vector>
				#include <map>
				#include <string>
				
				using namespace std;
				
				
				/// Minimal reader for XML-like tagged text.
				///
				/// The public entry points load a file and hand back a tree of Tag
				/// values; the static helpers query that tree. The private members
				/// are the individual stages of the reader.
				class NotXML {
				
				public:
				
					//=========================================================
					// One element of the document
					//=========================================================
					struct Tag {
				
						string name;                   // tag name
						string value;                  // raw text found directly inside the tag
						map<string, string> attribute; // attribute name -> attribute value
						vector<Tag> childList;         // nested elements, in document order
				
						// Both strings start out empty.
						Tag() : name(), value() {
						}
					};
					
					//=========================================================
					// Loading
					//   Read     : every top-level tag of the file
					//   ReadRoot : only the first top-level tag (an empty Tag
					//              when the file yields none)
					//=========================================================
					vector<Tag> Read     (string fileName);
					Tag         ReadRoot (string fileName);
				
					//=========================================================
					// Utilities
					//=========================================================
				
					// Returns a copy of the first direct child of `parent` called
					// `name`, or a default-constructed Tag when there is none.
					static Tag GetTagFromName( const Tag& parent, string name ) {
				
						typedef vector<Tag>::const_iterator ChildIter;
				
						for (ChildIter child = parent.childList.begin(); child != parent.childList.end(); ++child) {
				
							if ( child->name == name ) {
								return *child;
							}
						}
				
						return Tag();
					}
				
					// Returns copies of all direct children of `parent` called
					// `name`, keeping their original order.
					static vector<Tag> GetTagListFromName( const Tag& parent, string name ) {
				
						typedef vector<Tag>::const_iterator ChildIter;
				
						vector<Tag> matches;
				
						for (ChildIter child = parent.childList.begin(); child != parent.childList.end(); ++child) {
				
							if ( child->name != name ) {
								continue;
							}
				
							matches.push_back( *child );
						}
				
						return matches;
					}
				
					// Returns the value of attribute `name` of `tag`, or
					// `defaultValue` when the tag has no such attribute.
					static string GetAttribute( const Tag& tag, string name, string defaultValue = "" ) {
				
						map<string, string>::const_iterator entry = tag.attribute.find(name);
				
						if (entry == tag.attribute.end()) {
							return defaultValue;
						}
				
						return entry->second;
					}
				
				private:
				
					///// Kind of a raw fragment (only used while reading)
					enum PURE_TAG_TYPE {
				
						NOT_PURE_TAG_TYPE,     // invalid
						OPEN,    // opening tag            <***>
						CLOSE,   // closing tag            </***>
						EMPTY,   // empty-element tag      <***/>
						COMMENT, // comment                <!-- ***  -->
				
						PROCESS, // processing instruction <?***?>
						DOCTYPE, // DOCTYPE declaration    (starts with <!DOCTYPE)
				
						VALUE    // plain text, i.e. none of the above
					};
				
				
					//=========================================================
					// Reads a whole file into one string
					//=========================================================
					string ReadFile_toString(string fileName);
				
					//=========================================================
					// Cuts the text into tag-like fragments and the text
					// between them
					//=========================================================
					vector<string> Parse_PureTagList(string str);
				
					//=========================================================
					// Decides which PURE_TAG_TYPE a fragment is
					//=========================================================
					PURE_TAG_TYPE JadgePureTag( string pureTag );
				
					//=========================================================
					// Parses the attribute part of a tag into key/value pairs
					//=========================================================
					map<string, string> GetAttribute(string str);
				
					//=========================================================
					// Extracts name and attributes from one fragment
					//=========================================================
					Tag GetTagInfo(string str, PURE_TAG_TYPE type);
				
					//=========================================================
					// Builds the nested tag structure from the fragments
					//=========================================================
					vector<Tag> Parse(vector<string> pureTagList);
				
					//=========================================================
					// Trim both ends
					// [Argument]
					//   string
					// [Return value]
					//   the string without leading/trailing space, \n, \r, \t
					//=========================================================
					string Trim(const string &str);
				};
				#endif

- Implementation File [NotXML.cpp] -

				//=========================================================
				//
				// 簡易XMLパーサークラス（実装）
				//
				//=========================================================
				
				#include "NotXML.h"
				
				#include <stdio.h>
				
				#include <fstream>
				
				#include <vector>
				#include <stack>
				
				using namespace std;
				
				
				//=========================================================
				// Load a file
				// Returns every top-level tag found in the file, each one
				// carrying its nested children.
				//=========================================================
				vector<NotXML::Tag> NotXML::Read(string fileName) {
				
					// Stage 1: pull the whole file into a single string.
					const string text = ReadFile_toString( fileName );
				
					// Stage 2: cut the text into "<...>" fragments and the raw
					// text found between them.
					const vector<string> fragments = Parse_PureTagList( text );
				
					// Stage 3: match opening and closing tags and build the
					// nested structure. Its result is what the caller gets.
					return Parse( fragments );
				}
				//=========================================================
				// Load a file and return a single top-level element.
				// Handy when the document is wrapped in one root element.
				// All the work is delegated to Read(); this only picks the
				// first entry of its result.
				//=========================================================
				NotXML::Tag NotXML::ReadRoot(string fileName) {
				
					const vector<Tag> topLevel = Read(fileName);
				
					// Nothing was read: hand back a default-constructed Tag.
					if (topLevel.empty()) {
						return Tag();
					}
				
					return topLevel.front();
				}
				
				
				
				
				
				//=========================================================
				// Read a file into one string.
				// Lines are joined with '\n'; no '\n' is appended after the
				// last line. A file that cannot be read gives "".
				//=========================================================
				string NotXML::ReadFile_toString(string fileName) {
				
					// The stream is closed by its destructor on return.
					std::ifstream input( fileName.c_str() );
				
					string content;
					string line;
				
					// The first line goes in as-is ...
					if ( getline(input, line) ) {
				
						content += line;
				
						// ... and every further line is preceded by a separator.
						while ( getline(input, line) ) {
							content += '\n';
							content += line;
						}
					}
				
					return content;
				}
				
				//=========================================================
				// Cut the text into fragments.
				//
				// Each fragment is either
				//   - a piece starting at '<' and ending at the next '>', or
				//   - the raw text between two such pieces.
				// A fragment that starts with "<!--" is kept in one piece up
				// to the first '>' that completes "-->", so that '<' and '>'
				// inside a comment (for example commented-out tags) do not
				// produce fragments of their own.
				//
				// Text left over at the end of the input (after the last '>')
				// is dropped: it is a value that no tag ever closes.
				//=========================================================
				vector<string> NotXML::Parse_PureTagList(string str) {
				
					vector<string> result;
				
					string temp = "";
				
					// True while we are between "<!--" and its closing "-->".
					bool inComment = false;
				
					for (unsigned int i = 0; i < str.size(); ++i) {
				
						char c = str[i];
				
						// Inside a comment nothing is special except the end marker.
						if (inComment) {
				
							temp += c;
				
							if (c == '>' && temp.size() >= 3 && temp.compare(temp.size() - 3, 3, "-->") == 0) {
								result.push_back(temp);
								temp = "";
								inComment = false;
							}
				
							continue;
						}
				
						// A new tag starts here
						if (c == '<') {
				
							// Store whatever text was collected so far
							if (temp != "") {
								result.push_back(temp);
								temp = "";
							}
				
							// compare() clamps the length, so this is safe near
							// the end of the input.
							inComment = (str.compare(i, 4, "<!--") == 0);
						}
				
						temp += c;
				
						// Closed?
						if (c == '>') {
				
							// Store the finished fragment
							result.push_back(temp);
							temp = "";
						}
					}
				
					return result;
				}
				
				//=========================================================
				// Decide which kind of fragment this is.
				//
				// Rules, tried in this order (first match wins):
				//   COMMENT            starts with "<!--"      and ends with "-->"
				//   NOT_PURE_TAG_TYPE  starts with "</"        and ends with "/>"
				//   PROCESS            starts with "<?"        and ends with "?>"
				//   DOCTYPE            starts with "<!DOCTYPE" and ends with ">"
				//   EMPTY              starts with "<"         and ends with "/>"
				//   CLOSE              starts with "</"        and ends with ">"
				//   OPEN               starts with "<"         and ends with ">"
				//   VALUE              anything else (raw text)
				//
				// DOCTYPE accepts a plain ">" ending (which also covers "/>").
				// Otherwise "<!DOCTYPE ...>" would be taken for an opening tag
				// that is never closed.
				//=========================================================
				NotXML::PURE_TAG_TYPE NotXML::JadgePureTag( string pureTag ) {
				
					const string::size_type length = pureTag.size();
				
					// Prefix tests. compare() clamps the requested length to what
					// is available, so short fragments simply do not match.
					const bool startsWithLt      = pureTag.compare(0, 1, "<") == 0;
					const bool startsWithClose   = pureTag.compare(0, 2, "</") == 0;
					const bool startsWithProcess = pureTag.compare(0, 2, "<?") == 0;
					const bool startsWithComment = pureTag.compare(0, 4, "<!--") == 0;
					const bool startsWithDocType = pureTag.compare(0, 9, "<!DOCTYPE") == 0;
				
					// Suffix tests, each guarded against fragments that are too short.
					const bool endsWithGt        = length >= 1 && pureTag.compare(length - 1, 1, ">") == 0;
					const bool endsWithSlashGt   = length >= 2 && pureTag.compare(length - 2, 2, "/>") == 0;
					const bool endsWithProcess   = length >= 2 && pureTag.compare(length - 2, 2, "?>") == 0;
					const bool endsWithComment   = length >= 3 && pureTag.compare(length - 3, 3, "-->") == 0;
				
					///// Comment?
					if (startsWithComment && endsWithComment) {
						return COMMENT;
					}
				
					///// "</***/>" makes no sense.
					if (startsWithClose && endsWithSlashGt) {
						return NOT_PURE_TAG_TYPE;
					}
				
					///// Processing instruction?
					if (startsWithProcess && endsWithProcess) {
						return PROCESS;
					}
				
					///// DOCTYPE declaration?
					if (startsWithDocType && endsWithGt) {
						return DOCTYPE;
					}
				
					///// Empty-element tag?
					if (startsWithLt && endsWithSlashGt) {
						return EMPTY;
					}
				
					///// Closing tag?
					if (startsWithClose && endsWithGt) {
						return CLOSE;
					}
				
					///// Opening tag?
					if (startsWithLt && endsWithGt) {
						return OPEN;
					}
				
					///// Nothing matched: raw text.
					return VALUE;
				}
				
				
				//=========================================================
				// Parse the attribute part of a tag.
				//
				// [Argument]
				//   str : the text that follows the tag name,
				//         e.g. ` id="1" class='x' width=10`
				// [Return value]
				//   key -> value pairs. When a key appears more than once the
				//   first occurrence is kept.
				//
				// Rules:
				//   - Everything up to '=' is the key; it is trimmed with Trim().
				//   - Whitespace between '=' and the value is skipped.
				//   - A value that starts with " or ' runs to the next matching
				//     quote that is not escaped by a single backslash. The
				//     surrounding quotes are removed; backslashes inside the
				//     value are left untouched.
				//   - Any other value runs to the next whitespace character or
				//     to the end of the input.
				//   - A quoted value that is never closed is discarded.
				//=========================================================
				map<string, string> NotXML::GetAttribute(string str) {
				
					map<string, string> result;
				
					string key   = "";
					string value = "";
				
					// false: collecting a key, true: collecting a value
					bool readingValue = false;
				
					// true once the first character of the value has been seen
					bool valueStarted = false;
				
					for (unsigned int index = 0; index < str.size(); ++index) {
				
						const char c = str[index];
				
						// isspace() must not be given a negative value, which a plain
						// char can be for multi-byte text.
						const bool isBlank = isspace( static_cast<unsigned char>(c) ) != 0;
				
						///// Collecting the key
						if (!readingValue) {
				
							// '=' ends the key
							if (c == '=') {
								readingValue = true;
							} else {
								key += c;
							}
							continue;
						}
				
						///// Waiting for the first character of the value
						if (!valueStarted) {
				
							if (!isBlank) {
								valueStarted = true;
								value += c;   // may be an opening quote
							}
							continue;
						}
				
						///// Collecting the rest of the value
						bool finished = false;
				
						const char opener   = value[0];
						const bool isQuoted = (opener == '"' || opener == '\'');
				
						if (isQuoted) {
				
							// Only the same kind of quote that opened the value can close it.
							bool closes = (c == opener);
				
							if (closes) {
				
								// A quote preceded by one backslash is escaped, unless
								// that backslash is itself preceded by a backslash.
								const string::size_type size = value.size();
				
								const bool afterBackslash       = size > 1 && value[size - 1] == '\\';
								const bool afterDoubleBackslash = size > 2 && value[size - 2] == '\\';
				
								if (afterBackslash && !afterDoubleBackslash) {
									closes = false;
								}
							}
				
							value += c;
				
							if (closes) {
								// Drop the opening and the closing quote.
								value = value.substr( 1, value.size() - 2 );
								finished = true;
							}
				
						} else if (isBlank) {
				
							// Whitespace ends an unquoted value and is not part of it.
							finished = true;
				
						} else {
				
							value += c;
						}
				
						if (finished) {
				
							result.insert( pair<string, string>( Trim( key ), value ) );
				
							key   = "";
							value = "";
							readingValue = false;
							valueStarted = false;
						}
					}
				
					///// An unquoted value may simply run into the end of the input.
					if (readingValue && valueStarted) {
				
						const char opener = value[0];
				
						if (opener != '"' && opener != '\'') {
							result.insert( pair<string, string>( Trim( key ), value ) );
						}
					}
				
					return result;
				}
				
				//=========================================================
				// Extract the tag information from one fragment.
				//
				// [Arguments]
				//   str  : the fragment including its brackets, e.g. `<a x="1">`
				//   type : what JadgePureTag() said about the fragment
				// [Return value]
				//   OPEN / CLOSE / EMPTY / PROCESS : name and attributes filled in
				//   DOCTYPE                        : name "", value = the text
				//                                    after "<!DOCTYPE"
				//   any other type                 : an empty Tag
				//=========================================================
				NotXML::Tag NotXML::GetTagInfo(string str, PURE_TAG_TYPE type) {
				
					Tag result;
				
					///// How many characters of markup surround the content
					string::size_type prefixLength = 0;
					string::size_type suffixLength = 0;
					bool hasMarkup = true;
				
					switch (type) {
				
					// Opening tag <***>
					case OPEN:
						prefixLength = 1;
						suffixLength = 1;
						break;
				
					// Closing tag </***>
					case CLOSE:
						prefixLength = 2;
						suffixLength = 1;
						break;
				
					// Empty-element tag <***/>
					case EMPTY:
						prefixLength = 1;
						suffixLength = 2;
						break;
				
					// Processing instruction <?***?>
					case PROCESS:
						prefixLength = 2;
						suffixLength = 2;
						break;
				
					// DOCTYPE declaration; tolerate both ".../>" and "...>"
					case DOCTYPE:
						prefixLength = 9;
						suffixLength = (str.size() >= 2 && str[str.size() - 2] == '/') ? 2 : 1;
						break;
				
					default:
						hasMarkup = false;
						break;
					}
				
					///// The part between the brackets. The size check keeps the
					///// length from wrapping around on a too-short fragment.
					string content = "";
				
					if (hasMarkup && str.size() >= prefixLength + suffixLength) {
						content = str.substr( prefixLength, str.size() - prefixLength - suffixLength );
					}
				
					///// DOCTYPE is special: no name, the content becomes the value.
					if (type == DOCTYPE) {
						result.name  = "";
						result.value = content;
						return result;
					}
				
					string::size_type position = 0;
				
					// Skip whitespace in front of the tag name.
					// (isspace() must not be given a negative value, which a plain
					// char can be for multi-byte text.)
					while (position < content.size() && isspace( static_cast<unsigned char>( content[position] ) )) {
						++position;
					}
				
					// The tag name runs up to the next whitespace character.
					while (position < content.size() && !isspace( static_cast<unsigned char>( content[position] ) )) {
						result.name += content[position];
						++position;
					}
				
					// Whatever follows the tag name is the attribute part.
					string attributeString = (position < content.size()) ? content.substr( position ) : "";
				
					result.attribute = GetAttribute(attributeString);
				
					return result;
				}
				
				//=========================================================
				// タグ間の構造を解釈
				//=========================================================
				vector<NotXML::Tag> NotXML::Parse(vector<string> pureTagList) {
				
					///// 根元タグリスト
					vector<Tag> rootTagList;
				
					///// 開き中のタグ
					stack<Tag> tagStack;
				
					for (unsigned int i = 0; i < pureTagList.size(); ++i) {
				
						string pureTag = pureTagList[i];
				
				
						// 分類する
						PURE_TAG_TYPE type =  JadgePureTag( pureTag );
				
				
						// 開始タグでした
						if        ( type == OPEN ) {
				
							// タグの名前とかを取得して、
							Tag tag =  GetTagInfo( pureTag, type);
				
							// スタックに乗せる
							tagStack.push( tag );
				
				
						// 閉じタグでした
						} else if ( type == CLOSE ) {
				
							// タグの名前とかを取得
							Tag tag =  GetTagInfo( pureTag, type);
				
							// 開き中のタグがありますよね？
							if (tagStack.size() > 0 ) {
				
								// 開き中のタグと、名前が一致してますよね？
								if ( tagStack.top().name == tag.name ) {
				
									// タグを取り出して、閉じる。
									Tag closedTag = tagStack.top();
									tagStack.pop();
				
									// そして、開き中のタグがありますか？
									if ( tagStack.size() > 0 ) {
				
										// 開き中のタグの子要素として追加する
										tagStack.top().childList.push_back( closedTag );
				
									} else {
				
										// 最上位タグのリストに追加する
										rootTagList.push_back( closedTag );
									}
				
								// え、一致してないの？ それは困りましたね。
								} else {
									printf("閉じタグに対して、開き中のタグがない：%s\n", pureTag.c_str());
								}
				
				
							// え、ないの？ それは困りましたね。
							} else {
								printf("閉じタグに対して、開き中のタグがない：%s\n", pureTag.c_str());
							}
				
				
						// 空要素タグでした
						} else if ( type == EMPTY ) {
				
							// タグの名前とかを取得して、
							Tag tag =  GetTagInfo( pureTag, type);
				
							// 一応念を押して要素を空にしておく。
							tag.value = "";
				
							// 開き中のタグがありますか？
							if ( tagStack.size() > 0 ) {
				
								// 開いているタグの子要素として追加する
								tagStack.top().childList.push_back( tag );
				
							// 開き中のタグがないならば、
							} else {
				
								// 最上位タグのリストに追加する
								rootTagList.push_back( tag );
							}
				
						// 値でした
						} else if ( type == VALUE ) {
				
							// 開き中のタグがあるはずなのです。
							if ( tagStack.size() > 0 ) {
				
								// この次には閉じタグが来るはず…
								tagStack.top().value += pureTag;
				
							// え、開き中のタグがないの？
							} else {
								//printf("開き中のタグがない・値：%s\n", pureTag.c_str());
							}
				
				
						// コメントでした
						} else if ( type == COMMENT ) {
				
							// 無視
				
						// 処理命令でした
						} else if ( type == PROCESS ) {
				
							// 無視してしまいます。
				
						// DOCTYPE宣言でした
						} else if ( type == DOCTYPE ) {
				
							// 無視してしまいます。
				
				
						// 無効値でした
						} else if ( type == NOT_PURE_TAG_TYPE ) {
				
							// 無視
				
						}
					}
				
					///// 根元タグリスト
					rootTagList;
				
					///// 開き中のタグ
					tagStack;
				
				
					return rootTagList;
				}
				
				
				
				//=========================================================
				// Trim
				// [Argument]
				//   string
				// [Return value]
				//   the string without leading and trailing ' ', '\r', '\n'
				//   and '\t'. "" when the input is empty or contains nothing
				//   but those characters.
				//=========================================================
				string NotXML::Trim(const string &str){
				
					// The characters that get stripped from both ends.
					const char* const blanks = " \r\n\t";
				
					// First character that has to be kept
					const string::size_type first = str.find_first_not_of( blanks );
				
					// Empty, or blanks only
					if (first == string::npos) {
						return "";
					}
				
					// Last character that has to be kept; it exists because
					// `first` does, and it is never in front of `first`.
					const string::size_type last = str.find_last_not_of( blanks );
				
					return str.substr( first, last - first + 1 );
				}

You can copy the source code above and incorporate it into your own project. And of course you can modify it as much as you like, if you need to. This source code is in the public domain.

By the way, as you can see, it is written in our beloved C++.

Example

				#include "NotXML.h"
				#include <stdio.h>
				
				int main () {
				
					///// Loading a text file
					vector <NotXML::Tag> tagList = NotXML().Read( "yourStructuredDocumentFile.txt" );
				
					for ( unsigned int i = 0; i < tagList.size(); ++i ) {
				
						// Is the tag's name "TAG_A" ?
						if ( tagList[i].name == "TAG_A") {
							
							printf ("Value : %s\n", tagList[i].value.c_str() );
							
						}
						// Is the tag's name "TAG_B" ?
						else if (tagList[i].name == "TAG_B") {
						
							printf ("Value : %s\n", tagList[i].value.c_str() );
							
						}
					}
					
					return 0;
				}

It generates a list of Tags from the file you give to the function. Each Tag structure holds the information for one tag, and it can contain other Tag structures, so you can even load nested structures.