feat: JSON reading

- Implemented a tokenizer for JSON
- Implemented a method which reads JSON from a string using the tokenizer
author: Dylan <boss@tehbox.org> 2026-06-06 19:36:41 +1200
committer: Dylan <boss@tehbox.org> 2026-06-06 19:36:41 +1200
commit: 1d379a5cf34475f66f2ab9359f77dac162c0a40e (patch)
tree: 395d54815331fbcad053001bf5cb28fafb69e9d4 /src
parent: 46c896bcd78d31130321562b0659e28230261b8e (diff)
download: tehjson-1d379a5cf34475f66f2ab9359f77dac162c0a40e.tar.gz
tehjson-1d379a5cf34475f66f2ab9359f77dac162c0a40e.zip
4 files changed, 212 insertions, 2 deletions
diff --git a/src/json.cpp b/src/json.cpp
index c11ca6e..9c1c3f9 100644
--- a/src/json.cpp
+++ b/src/json.cpp
@@ -1,7 +1,10 @@
 #include "json.h"
+#include "tokenizer.h"
 
 #include <cstddef>
+#include <iostream>
 #include <stdexcept>
+#include <string>
 
 namespace TehJSON
 {
@@ -66,4 +69,62 @@ namespace TehJSON
 			throw std::runtime_error("Node is a leaf!");
 		return children.size();
 	}
+
+	Token JSON::consume() // Hands out the token under the cursor and moves the cursor one step forward.
+	{
+		if(tokenPos < tokens.size())
+			return tokens[tokenPos++];
+		throw std::out_of_range("No tokens left, but json not finished!"); // cursor is at or past the end of the token list
+	}
+
+	Token JSON::consume(TokenType type) // Like consume(), but the token handed out must be of kind `type`.
+	{
+		Token next = consume(); // the cursor has already moved on, even if the check below fails
+		if(next.type == type)
+			return next;
+		throw std::runtime_error("Wrong token type, expected: " + getTokenName(type) + ", but got: " + getTokenName(next.type));
+	}
+	
+	TokenType JSON::nextTokenType() // Peeks at the kind of the upcoming token; the cursor stays where it is.
+	{
+		if(tokenPos < tokens.size())
+			return tokens[tokenPos].type;
+		throw std::out_of_range("No tokens left, but json not finished!"); // nothing left to peek at
+	}
+
+	void JSON::readFromString(std::string s) // Tokenizes s and loads this node from the resulting token list.
+	{
+		Tokenizer lexer;
+		lexer.appendInput(s);
+		// Parse from the first token. The end position returned by readFromTokens is
+		// discarded, so tokens after the root object's closing brace are not inspected.
+		readFromTokens(lexer.tokenize(), 0);
+	}
+
+	int JSON::readFromTokens(std::vector<Token> tokens, int pos) // Parses one `{ ... }` object starting at tokens[pos] into this node; returns the index just past its closing brace.
+	{
+		this->tokens = tokens; // this node keeps its own copy of the list; consume() and nextTokenType() read the member
+		tokenPos = pos;
+		
+		consume(TokenType::LBrace); // throws unless the token at pos opens an object
+		while(nextTokenType() != TokenType::RBrace)
+		{
+			Token childName = consume(TokenType::StringLit); // member key
+			// std::cout << "Child: " << childName.content << std::endl;
+			consume(TokenType::Colon);
+			switch(nextTokenType())
+			{
+			case TokenType::LBrace: tokenPos = children[childName.content].readFromTokens(tokens, tokenPos); break; // nested object: the child parses itself and reports where it stopped
+			case TokenType::StringLit: children[childName.content].set<std::string>(consume(TokenType::StringLit).content); break;
+			case TokenType::IntLit: children[childName.content].set<int>(std::stoi(consume(TokenType::IntLit).content)); break; // token text converted with std::stoi
+			case TokenType::FloatLit: children[childName.content].set<float>(std::stof(consume(TokenType::FloatLit).content)); break; // token text converted with std::stof
+			default: throw std::runtime_error("Token type is not a literal!"); // any other token kind after ':' is rejected
+			}
+			if(nextTokenType() != TokenType::RBrace)
+				consume(TokenType::Comma); // NOTE(review): a comma directly before '}' is accepted, because the loop condition then sees RBrace and exits
+		}
+		consume(TokenType::RBrace);
+
+		return tokenPos; // index of the first token after this object's '}'
+	}
 }
diff --git a/src/json.h b/src/json.h
index d1d7938..efde542 100644
--- a/src/json.h
+++ b/src/json.h
@@ -1,7 +1,12 @@
+#pragma once
+
+#include "tokenizer.h"
+
 #include <cstddef>
 #include <map>
 #include <memory>
 #include <string>
+#include <vector>
 
 namespace TehJSON
 {
@@ -12,14 +17,19 @@ namespace TehJSON
 		JSON(const JSON& other) = default;
 		~JSON();
 
+		// Writing methods
+		std::string getSerialized();
+		std::string _getSerialized(int currIndent);
+
+		// Reading methods
+		void readFromString(std::string s);
+
 		// Leaf methods
 		template <typename T>
 		T& get();
 		template <typename T>
 		void set(T value);
 		std::string leafType();
-		std::string getSerialized();
-		std::string _getSerialized(int currIndent);
 		template <typename T>
 		static std::string serializeData(std::shared_ptr<void> data);
 
@@ -30,6 +40,14 @@ namespace TehJSON
 	private:
 		bool isLeaf = false;
 
+		// Reading data fields
+		std::vector<Token> tokens;
+		int tokenPos = 0;
+		Token consume();
+		Token consume(TokenType type);
+		TokenType nextTokenType();
+		int readFromTokens(std::vector<Token> tokens, int pos);
+
 		// Leaf data fields
 		std::shared_ptr<void> data;
 		std::string (*dataSerializer)(std::shared_ptr<void>);
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000..ac7c215
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,95 @@
+#include "tokenizer.h"
+#include <stdexcept>
+
+namespace TehJSON
+{
+	std::string getTokenName(TokenType t) // Returns the enumerator's name as text, or "Unknown" for a value outside the enum.
+	{
+		switch(t)
+		{
+		case TokenType::LBrace: return "LBrace";
+		case TokenType::RBrace: return "RBrace";
+		case TokenType::Colon: return "Colon";
+		case TokenType::Comma: return "Comma";
+		case TokenType::StringLit: return "StringLit";
+		case TokenType::IntLit: return "IntLit";
+		case TokenType::FloatLit: return "FloatLit";
+		}
+		return "Unknown"; // not reached for the enumerators above; keeps control from flowing off the end of a non-void function
+	}
+	void Tokenizer::appendInput(std::string s) // Adds s to the end of the text that tokenize() will process.
+	{
+		input.append(s);
+	}
+
+	std::vector<Token> Tokenizer::tokenize() // Converts the accumulated input into tokens; throws std::runtime_error or std::out_of_range on malformed input.
+	{
+		std::vector<Token> tokens;
+
+		std::string::size_type pos = 0; // unsigned, the same type input.size() returns
+		while(pos < input.size())
+		{
+			char c = input.at(pos);
+			switch(c)
+			{
+			case '\t':
+			case '\n':
+			case '\r': // so that CRLF line endings are skipped like any other whitespace
+			case ' ': break;
+			case '"':
+			{
+				pos++; // step past the opening quote
+				if(pos >= input.size())
+					throw std::out_of_range("String literal never ends!");
+				std::string literalContent = "";
+				c = input.at(pos);
+				while(c != '"') // no escape sequences are interpreted: the next '"' always closes the literal
+				{
+					literalContent += c;
+					pos++;
+					if(pos >= input.size())
+						throw std::out_of_range("String literal never ends!");
+					c = input.at(pos);
+				}
+				tokens.push_back({TokenType::StringLit, literalContent});
+				break;
+			}
+			case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { // spelled out: the `'0' ... '9'` range form is a compiler extension
+				std::string literalContent{c};
+				bool isInt = true; // becomes false once a '.' has been seen
+				while(true)
+				{
+					pos++;
+					if(pos >= input.size()) // checked before every read, the one right after the first digit included
+						throw std::out_of_range("Number literal never ends!");
+					c = input.at(pos);
+					if(c == '.')
+					{
+						if(!isInt)
+							throw std::runtime_error("Cannot have multiple decimal places in float");
+						isInt = false;
+					}
+					else if(c < '0' || c > '9')
+						break; // first character that is not part of the number
+					literalContent += c;
+				}
+				tokens.push_back({isInt?TokenType::IntLit:TokenType::FloatLit, literalContent});
+				continue; // pos already rests on the character after the number, so skip the pos++ below
+			}
+			case '{': tokens.push_back({TokenType::LBrace, std::string{c}}); break;
+			case '}': tokens.push_back({TokenType::RBrace, std::string{c}}); break;
+			case ':': tokens.push_back({TokenType::Colon, std::string{c}}); break;
+			case ',': tokens.push_back({TokenType::Comma, std::string{c}}); break;
+			default: throw std::runtime_error("Unexpected character: '" + std::string{c} + "'");
+			}
+			pos++;
+		}
+
+		return tokens;
+	}
+
+	std::string Tokenizer::getInput() // Returns a copy of all text appended so far.
+	{
+		return this->input;
+	}
+}
diff --git a/src/tokenizer.h b/src/tokenizer.h
new file mode 100644
index 0000000..3322553
--- /dev/null
+++ b/src/tokenizer.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace TehJSON
+{
+	enum struct TokenType // Kinds of token the Tokenizer can emit.
+	{
+		LBrace, // '{'
+		RBrace, // '}'
+		Colon, // ':'
+		Comma, // ','
+		StringLit, // text enclosed in double quotes
+		IntLit, // run of decimal digits
+		FloatLit, // run of decimal digits containing one '.'
+	};
+
+	std::string getTokenName(TokenType t);
+
+	struct Token // One lexical unit produced by Tokenizer::tokenize().
+	{
+		TokenType type; // which kind of token this is
+		std::string content; // text of the token; for string literals, the characters between the quotes
+	};
+
+	class Tokenizer // Accumulates input text and converts it into a list of tokens.
+	{
+	public:
+		void appendInput(std::string s); // appends s to the text accumulated so far
+		std::vector<Token> tokenize(); // tokenizes the whole accumulated input from its start; throws on malformed input
+		std::string getInput(); // returns a copy of the accumulated input
+	private:
+		std::string input = ""; // everything passed to appendInput so far, concatenated
+	};
+}
author	Dylan <boss@tehbox.org>	2026-06-06 19:36:41 +1200
committer	Dylan <boss@tehbox.org>	2026-06-06 19:36:41 +1200
commit	1d379a5cf34475f66f2ab9359f77dac162c0a40e (patch)
tree	395d54815331fbcad053001bf5cb28fafb69e9d4 /src
parent	46c896bcd78d31130321562b0659e28230261b8e (diff)
download	tehjson-1d379a5cf34475f66f2ab9359f77dac162c0a40e.tar.gz tehjson-1d379a5cf34475f66f2ab9359f77dac162c0a40e.zip