"""unicode.py

 Playing around with strings, bytes, utf8, unicode, and all that in python 3.

 Strings (type <str>) are sequences of characters (usually unicode),
 not sequences of bytes.  The ord() function returns the unicode "code
 point" (i.e. unicode numeric value) of the character.  They are input
 with single or double quotes : 'abc' or "έψιλον".  They can be read
 in from a file with open(filename,'r').read(). Strings have a
 .encode() method which turns them into a sequence of bytes. The
 default encoding is UTF-8, in which unusual characters get stored as
 multiple bytes, but common ASCII characters take one byte.

 Bytes (type <bytes>) are sequences of 8-bit bytes, not unicode
 characters. They can be typed as b'abc' or (for the greek characters
 above) b'\xce\xad\xcf\x88\xce\xb9\xce\xbb\xce\xbf\xce\xbd' where
 something like \xce means the hex byte 0xCE, which is 206 in base 10.
 And they can be read from a file which has been opened in binary
 mode, e.g. open(filename,'rb').read(). Bytes have a .decode() method
 which turns them into character strings.

 (Note that in python 2.*, ordinary strings were just byte arrays -
 unicode text needed a separate <unicode> type, e.g. u'abc', and there
 was no distinct bytes type.)

 This code gives some examples. Running it looks like this.

     $ python --version
     Python 3.7.3

     $ python unicode.py 

     -- string -- 
     faces = ' 🤨 😟 '
     type(faces) =  <class 'str'>
     len(faces) =  5
     faces.encode() =  b' \xf0\x9f\xa4\xa8 \xf0\x9f\x98\x9f '
    
     -- s = open(,'r').read() --
     type(s) =  <class 'str'>
     len(s) =  3932
     repr( s[2030:2050] ) = 'es = " 🤨 😟 "       #'
    
     -- bytes -- 
     face_bytes = b' \xf0\x9f\xa4\xa8 \xf0\x9f\x98\x9f '
     type(face_bytes) =  <class 'bytes'>
     len(face_bytes) =  11
     face_bytes.decode() = ' 🤨 😟 '
    
     -- b = open(,'rb').read() --
     type(b) =  <class 'bytes'>
     len(b) =  4022
     repr( b[1900:1930] ) = b"' + epsilon = ' epsilon \xce\xad\xcf\x88\xce\xb9"

     -- face with one raised eyebrow U+1F928 --
     ord('🤨') = 129320 = 0x1f928 = unicode code point
     '🤨'.encode() =  b'\xf0\x9f\xa4\xa8'

     -- non-ascii symbols in variable names --
     έψιλον + ' ' + epsilon = ' epsilon έψιλον'

 (I used the repr() representation function above to avoid printing newlines.)

 *Now* we're having fun.

 Jim Mahoney | cs.marlboro.college | MIT License | Oct 2019

"""
# Blank line to set the report apart from the shell prompt.
print()

# A str holds unicode characters; its length counts characters, not bytes.
print(" -- string -- ")
faces = " 🤨 😟 "       # space, face, space, face, space : five characters
print(f" faces = '{faces}'")
print(f" type(faces) =  {type(faces)}")
print(f" len(faces) =  {len(faces)}")
print(f" faces.encode() =  {faces.encode()}")   # .encode() defaults to UTF-8
print()

print(" -- s = open(,'r').read() --")
s = open('unicode.py', 'r').read()
print(" type(s) = ", type(s))
print(" len(s) = ", len(s))
print(" repr( s[2030:2050] ) = {:s}".format(repr(s[2030:2050])))
print()

print(" -- bytes -- ")
face_bytes = " 🤨 😟 ".encode()
print(" face_bytes = {}".format(face_bytes))
print(" type(face_bytes) = ", type(face_bytes))
print(" len(face_bytes) = ", len(face_bytes))     # lots more than five bytes
print(" face_bytes.decode() = '{}'".format(face_bytes.decode()))
print()

print(" -- b = open(,'rb').read() --")
b = open('unicode.py', 'rb').read()
print(" type(b) = ", type(b))
print(" len(b) = ", len(b))
print(" repr( b[1900:1930] ) = {}".format(repr(b[1900:1930])))
print()

# A favorite: "face with one eyebrow raised", unicode character U+1F928,
# also known as the "Colbert emoji".
# See www.fileformat.info/info/unicode/char/1f928/index.htm
# and unicode.org/emoji/charts/full-emoji-list.html

print(" -- face with one raised eyebrow U+1F928 --")
raised_eyebrow = '🤨'
code_point = ord(raised_eyebrow)          # the character's unicode number
print(f" ord('🤨') = {code_point} = {hex(code_point)} = unicode code point")
print(f" '🤨'.encode() =  {raised_eyebrow.encode()}")
print()

# You can put "symbol like" characters into variable names,
# but many unicode symbols are not allowed in varaibles.
# See https://docs.python.org/3.3/reference/lexical_analysis.html#identifiers
#
# For example this doesn't work,
# giving "SyntaxError: invalid character in identifier"
# __🤨__ = 3
#
# But you can do the following, which looks Greek to me ...
print(" -- non-ascii symbols in variable names --")
έψιλον = "epsilon"
epsilon = "έψιλον"
print(" έψιλον + ' ' + epsilon = '", έψιλον + ' ' + epsilon + "'")
print()